linux/fs/btrfs/extent-tree.c
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/blkdev.h>
  22#include <linux/sort.h>
  23#include <linux/rcupdate.h>
  24#include <linux/kthread.h>
  25#include <linux/slab.h>
  26#include <linux/ratelimit.h>
  27#include <linux/percpu_counter.h>
  28#include "hash.h"
  29#include "ctree.h"
  30#include "disk-io.h"
  31#include "print-tree.h"
  32#include "transaction.h"
  33#include "volumes.h"
  34#include "raid56.h"
  35#include "locking.h"
  36#include "free-space-cache.h"
  37#include "math.h"
  38#include "sysfs.h"
  39
  40#undef SCRAMBLE_DELAYED_REFS
  41
  42/*
  43 * control flags for do_chunk_alloc's force field
  44 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  45 * if we really need one.
  46 *
  47 * CHUNK_ALLOC_LIMITED means to only try and allocate one
  48 * if we have very few chunks already allocated.  This is
  49 * used as part of the clustering code to help make sure
  50 * we have a good pool of storage to cluster in, without
  51 * filling the FS with empty chunks
  52 *
  53 * CHUNK_ALLOC_FORCE means it must try to allocate one
  54 *
  55 */
  56enum {
  57        CHUNK_ALLOC_NO_FORCE = 0,
  58        CHUNK_ALLOC_LIMITED = 1,
  59        CHUNK_ALLOC_FORCE = 2,
  60};
  61
  62/*
  63 * Control how reservations are dealt with.
  64 *
  65 * RESERVE_FREE - freeing a reservation.
  66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  67 *   ENOSPC accounting
  68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  69 *   bytes_may_use as the ENOSPC accounting is done elsewhere
  70 */
  71enum {
  72        RESERVE_FREE = 0,
  73        RESERVE_ALLOC = 1,
  74        RESERVE_ALLOC_NO_ACCOUNT = 2,
  75};
  76
  77static int update_block_group(struct btrfs_root *root,
  78                              u64 bytenr, u64 num_bytes, int alloc);
  79static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  80                                struct btrfs_root *root,
  81                                u64 bytenr, u64 num_bytes, u64 parent,
  82                                u64 root_objectid, u64 owner_objectid,
  83                                u64 owner_offset, int refs_to_drop,
  84                                struct btrfs_delayed_extent_op *extra_op);
  85static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  86                                    struct extent_buffer *leaf,
  87                                    struct btrfs_extent_item *ei);
  88static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  89                                      struct btrfs_root *root,
  90                                      u64 parent, u64 root_objectid,
  91                                      u64 flags, u64 owner, u64 offset,
  92                                      struct btrfs_key *ins, int ref_mod);
  93static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  94                                     struct btrfs_root *root,
  95                                     u64 parent, u64 root_objectid,
  96                                     u64 flags, struct btrfs_disk_key *key,
  97                                     int level, struct btrfs_key *ins);
  98static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  99                          struct btrfs_root *extent_root, u64 flags,
 100                          int force);
 101static int find_next_key(struct btrfs_path *path, int level,
 102                         struct btrfs_key *key);
 103static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 104                            int dump_block_groups);
 105static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 106                                       u64 num_bytes, int reserve);
 107static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 108                               u64 num_bytes);
 109int btrfs_pin_extent(struct btrfs_root *root,
 110                     u64 bytenr, u64 num_bytes, int reserved);
 111
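/*
 * Return non-zero once free space caching for this block group has either
 * finished or failed.
 */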
 112static noinline int
 113block_group_cache_done(struct btrfs_block_group_cache *cache)
 114{
 115        smp_mb();
 116        return cache->cached == BTRFS_CACHE_FINISHED ||
 117                cache->cached == BTRFS_CACHE_ERROR;
 118}
 119
 120static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 121{
 122        return (cache->flags & bits) == bits;
 123}
 124
 125static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 126{
 127        atomic_inc(&cache->count);
 128}
 129
 130void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 131{
 132        if (atomic_dec_and_test(&cache->count)) {
 133                WARN_ON(cache->pinned > 0);
 134                WARN_ON(cache->reserved > 0);
 135                kfree(cache->free_space_ctl);
 136                kfree(cache);
 137        }
 138}
 139
 140/*
 141 * this adds the block group to the fs_info rb tree for the block group
 142 * cache
 143 */
 144static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 145                                struct btrfs_block_group_cache *block_group)
 146{
 147        struct rb_node **p;
 148        struct rb_node *parent = NULL;
 149        struct btrfs_block_group_cache *cache;
 150
 151        spin_lock(&info->block_group_cache_lock);
 152        p = &info->block_group_cache_tree.rb_node;
 153
 154        while (*p) {
 155                parent = *p;
 156                cache = rb_entry(parent, struct btrfs_block_group_cache,
 157                                 cache_node);
 158                if (block_group->key.objectid < cache->key.objectid) {
 159                        p = &(*p)->rb_left;
 160                } else if (block_group->key.objectid > cache->key.objectid) {
 161                        p = &(*p)->rb_right;
 162                } else {
 163                        spin_unlock(&info->block_group_cache_lock);
 164                        return -EEXIST;
 165                }
 166        }
 167
 168        rb_link_node(&block_group->cache_node, parent, p);
 169        rb_insert_color(&block_group->cache_node,
 170                        &info->block_group_cache_tree);
 171
 172        if (info->first_logical_byte > block_group->key.objectid)
 173                info->first_logical_byte = block_group->key.objectid;
 174
 175        spin_unlock(&info->block_group_cache_lock);
 176
 177        return 0;
 178}
 179
 180/*
 181 * This will return the block group at or after bytenr if contains is 0, else
 182 * it will return the block group that contains the bytenr
 183 */
 184static struct btrfs_block_group_cache *
 185block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 186                              int contains)
 187{
 188        struct btrfs_block_group_cache *cache, *ret = NULL;
 189        struct rb_node *n;
 190        u64 end, start;
 191
 192        spin_lock(&info->block_group_cache_lock);
 193        n = info->block_group_cache_tree.rb_node;
 194
 195        while (n) {
 196                cache = rb_entry(n, struct btrfs_block_group_cache,
 197                                 cache_node);
 198                end = cache->key.objectid + cache->key.offset - 1;
 199                start = cache->key.objectid;
 200
 201                if (bytenr < start) {
 202                        if (!contains && (!ret || start < ret->key.objectid))
 203                                ret = cache;
 204                        n = n->rb_left;
 205                } else if (bytenr > start) {
 206                        if (contains && bytenr <= end) {
 207                                ret = cache;
 208                                break;
 209                        }
 210                        n = n->rb_right;
 211                } else {
 212                        ret = cache;
 213                        break;
 214                }
 215        }
 216        if (ret) {
 217                btrfs_get_block_group(ret);
 218                if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 219                        info->first_logical_byte = ret->key.objectid;
 220        }
 221        spin_unlock(&info->block_group_cache_lock);
 222
 223        return ret;
 224}
 225
 226static int add_excluded_extent(struct btrfs_root *root,
 227                               u64 start, u64 num_bytes)
 228{
 229        u64 end = start + num_bytes - 1;
 230        set_extent_bits(&root->fs_info->freed_extents[0],
 231                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 232        set_extent_bits(&root->fs_info->freed_extents[1],
 233                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 234        return 0;
 235}
 236
 237static void free_excluded_extents(struct btrfs_root *root,
 238                                  struct btrfs_block_group_cache *cache)
 239{
 240        u64 start, end;
 241
 242        start = cache->key.objectid;
 243        end = start + cache->key.offset - 1;
 244
 245        clear_extent_bits(&root->fs_info->freed_extents[0],
 246                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 247        clear_extent_bits(&root->fs_info->freed_extents[1],
 248                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 249}
 250
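/*
 * Exclude the space used by the superblock mirrors (and anything below the
 * primary superblock offset) from this block group's free space; the number
 * of excluded bytes is accumulated in cache->bytes_super.
 */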
 251static int exclude_super_stripes(struct btrfs_root *root,
 252                                 struct btrfs_block_group_cache *cache)
 253{
 254        u64 bytenr;
 255        u64 *logical;
 256        int stripe_len;
 257        int i, nr, ret;
 258
 259        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 260                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 261                cache->bytes_super += stripe_len;
 262                ret = add_excluded_extent(root, cache->key.objectid,
 263                                          stripe_len);
 264                if (ret)
 265                        return ret;
 266        }
 267
 268        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 269                bytenr = btrfs_sb_offset(i);
 270                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 271                                       cache->key.objectid, bytenr,
 272                                       0, &logical, &nr, &stripe_len);
 273                if (ret)
 274                        return ret;
 275
 276                while (nr--) {
 277                        u64 start, len;
 278
 279                        if (logical[nr] > cache->key.objectid +
 280                            cache->key.offset)
 281                                continue;
 282
 283                        if (logical[nr] + stripe_len <= cache->key.objectid)
 284                                continue;
 285
 286                        start = logical[nr];
 287                        if (start < cache->key.objectid) {
 288                                start = cache->key.objectid;
 289                                len = (logical[nr] + stripe_len) - start;
 290                        } else {
 291                                len = min_t(u64, stripe_len,
 292                                            cache->key.objectid +
 293                                            cache->key.offset - start);
 294                        }
 295
 296                        cache->bytes_super += len;
 297                        ret = add_excluded_extent(root, start, len);
 298                        if (ret) {
 299                                kfree(logical);
 300                                return ret;
 301                        }
 302                }
 303
 304                kfree(logical);
 305        }
 306        return 0;
 307}
 308
 309static struct btrfs_caching_control *
 310get_caching_control(struct btrfs_block_group_cache *cache)
 311{
 312        struct btrfs_caching_control *ctl;
 313
 314        spin_lock(&cache->lock);
 315        if (cache->cached != BTRFS_CACHE_STARTED) {
 316                spin_unlock(&cache->lock);
 317                return NULL;
 318        }
 319
 320        /* We're loading it the fast way, so we don't have a caching_ctl. */
 321        if (!cache->caching_ctl) {
 322                spin_unlock(&cache->lock);
 323                return NULL;
 324        }
 325
 326        ctl = cache->caching_ctl;
 327        atomic_inc(&ctl->count);
 328        spin_unlock(&cache->lock);
 329        return ctl;
 330}
 331
 332static void put_caching_control(struct btrfs_caching_control *ctl)
 333{
 334        if (atomic_dec_and_test(&ctl->count))
 335                kfree(ctl);
 336}
 337
 338/*
  339 * This is only called by cache_block_group.  Since we could have freed extents,
  340 * we need to check the pinned_extents for any extents that can't be used yet,
  341 * because their free space will only be released once the transaction commits.
 342 */
 343static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 344                              struct btrfs_fs_info *info, u64 start, u64 end)
 345{
 346        u64 extent_start, extent_end, size, total_added = 0;
 347        int ret;
 348
 349        while (start < end) {
 350                ret = find_first_extent_bit(info->pinned_extents, start,
 351                                            &extent_start, &extent_end,
 352                                            EXTENT_DIRTY | EXTENT_UPTODATE,
 353                                            NULL);
 354                if (ret)
 355                        break;
 356
 357                if (extent_start <= start) {
 358                        start = extent_end + 1;
 359                } else if (extent_start > start && extent_start < end) {
 360                        size = extent_start - start;
 361                        total_added += size;
 362                        ret = btrfs_add_free_space(block_group, start,
 363                                                   size);
 364                        BUG_ON(ret); /* -ENOMEM or logic error */
 365                        start = extent_end + 1;
 366                } else {
 367                        break;
 368                }
 369        }
 370
 371        if (start < end) {
 372                size = end - start;
 373                total_added += size;
 374                ret = btrfs_add_free_space(block_group, start, size);
 375                BUG_ON(ret); /* -ENOMEM or logic error */
 376        }
 377
 378        return total_added;
 379}
 380
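/*
 * Worker that slowly fills in a block group's free space cache by walking the
 * extent tree in the commit root: every gap between extent items inside the
 * block group is added as free space, and caching_ctl->progress is advanced
 * so waiters can start using the part that is already cached.
 */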
 381static noinline void caching_thread(struct btrfs_work *work)
 382{
 383        struct btrfs_block_group_cache *block_group;
 384        struct btrfs_fs_info *fs_info;
 385        struct btrfs_caching_control *caching_ctl;
 386        struct btrfs_root *extent_root;
 387        struct btrfs_path *path;
 388        struct extent_buffer *leaf;
 389        struct btrfs_key key;
 390        u64 total_found = 0;
 391        u64 last = 0;
 392        u32 nritems;
 393        int ret = -ENOMEM;
 394
 395        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 396        block_group = caching_ctl->block_group;
 397        fs_info = block_group->fs_info;
 398        extent_root = fs_info->extent_root;
 399
 400        path = btrfs_alloc_path();
 401        if (!path)
 402                goto out;
 403
 404        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 405
 406        /*
 407         * We don't want to deadlock with somebody trying to allocate a new
 408         * extent for the extent root while also trying to search the extent
 409         * root to add free space.  So we skip locking and search the commit
  410         * root, since it's read-only.
 411         */
 412        path->skip_locking = 1;
 413        path->search_commit_root = 1;
 414        path->reada = 1;
 415
 416        key.objectid = last;
 417        key.offset = 0;
 418        key.type = BTRFS_EXTENT_ITEM_KEY;
 419again:
 420        mutex_lock(&caching_ctl->mutex);
 421        /* need to make sure the commit_root doesn't disappear */
 422        down_read(&fs_info->commit_root_sem);
 423
 424next:
 425        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 426        if (ret < 0)
 427                goto err;
 428
 429        leaf = path->nodes[0];
 430        nritems = btrfs_header_nritems(leaf);
 431
 432        while (1) {
 433                if (btrfs_fs_closing(fs_info) > 1) {
 434                        last = (u64)-1;
 435                        break;
 436                }
 437
 438                if (path->slots[0] < nritems) {
 439                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 440                } else {
 441                        ret = find_next_key(path, 0, &key);
 442                        if (ret)
 443                                break;
 444
 445                        if (need_resched() ||
 446                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 447                                caching_ctl->progress = last;
 448                                btrfs_release_path(path);
 449                                up_read(&fs_info->commit_root_sem);
 450                                mutex_unlock(&caching_ctl->mutex);
 451                                cond_resched();
 452                                goto again;
 453                        }
 454
 455                        ret = btrfs_next_leaf(extent_root, path);
 456                        if (ret < 0)
 457                                goto err;
 458                        if (ret)
 459                                break;
 460                        leaf = path->nodes[0];
 461                        nritems = btrfs_header_nritems(leaf);
 462                        continue;
 463                }
 464
 465                if (key.objectid < last) {
 466                        key.objectid = last;
 467                        key.offset = 0;
 468                        key.type = BTRFS_EXTENT_ITEM_KEY;
 469
 470                        caching_ctl->progress = last;
 471                        btrfs_release_path(path);
 472                        goto next;
 473                }
 474
 475                if (key.objectid < block_group->key.objectid) {
 476                        path->slots[0]++;
 477                        continue;
 478                }
 479
 480                if (key.objectid >= block_group->key.objectid +
 481                    block_group->key.offset)
 482                        break;
 483
 484                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 485                    key.type == BTRFS_METADATA_ITEM_KEY) {
 486                        total_found += add_new_free_space(block_group,
 487                                                          fs_info, last,
 488                                                          key.objectid);
 489                        if (key.type == BTRFS_METADATA_ITEM_KEY)
 490                                last = key.objectid +
 491                                        fs_info->tree_root->leafsize;
 492                        else
 493                                last = key.objectid + key.offset;
 494
 495                        if (total_found > (1024 * 1024 * 2)) {
 496                                total_found = 0;
 497                                wake_up(&caching_ctl->wait);
 498                        }
 499                }
 500                path->slots[0]++;
 501        }
 502        ret = 0;
 503
 504        total_found += add_new_free_space(block_group, fs_info, last,
 505                                          block_group->key.objectid +
 506                                          block_group->key.offset);
 507        caching_ctl->progress = (u64)-1;
 508
 509        spin_lock(&block_group->lock);
 510        block_group->caching_ctl = NULL;
 511        block_group->cached = BTRFS_CACHE_FINISHED;
 512        spin_unlock(&block_group->lock);
 513
 514err:
 515        btrfs_free_path(path);
 516        up_read(&fs_info->commit_root_sem);
 517
 518        free_excluded_extents(extent_root, block_group);
 519
 520        mutex_unlock(&caching_ctl->mutex);
 521out:
 522        if (ret) {
 523                spin_lock(&block_group->lock);
 524                block_group->caching_ctl = NULL;
 525                block_group->cached = BTRFS_CACHE_ERROR;
 526                spin_unlock(&block_group->lock);
 527        }
 528        wake_up(&caching_ctl->wait);
 529
 530        put_caching_control(caching_ctl);
 531        btrfs_put_block_group(block_group);
 532}
 533
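/*
 * Start free space caching for a block group.  With the space_cache mount
 * option we first try to load the free space cache written on disk; when that
 * fails, and the caller did not ask for load_cache_only, caching_thread is
 * queued to rebuild the cache from the extent tree.
 */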
 534static int cache_block_group(struct btrfs_block_group_cache *cache,
 535                             int load_cache_only)
 536{
 537        DEFINE_WAIT(wait);
 538        struct btrfs_fs_info *fs_info = cache->fs_info;
 539        struct btrfs_caching_control *caching_ctl;
 540        int ret = 0;
 541
 542        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 543        if (!caching_ctl)
 544                return -ENOMEM;
 545
 546        INIT_LIST_HEAD(&caching_ctl->list);
 547        mutex_init(&caching_ctl->mutex);
 548        init_waitqueue_head(&caching_ctl->wait);
 549        caching_ctl->block_group = cache;
 550        caching_ctl->progress = cache->key.objectid;
 551        atomic_set(&caching_ctl->count, 1);
 552        btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 553
 554        spin_lock(&cache->lock);
 555        /*
 556         * This should be a rare occasion, but this could happen I think in the
 557         * case where one thread starts to load the space cache info, and then
 558         * some other thread starts a transaction commit which tries to do an
 559         * allocation while the other thread is still loading the space cache
 560         * info.  The previous loop should have kept us from choosing this block
 561         * group, but if we've moved to the state where we will wait on caching
 562         * block groups we need to first check if we're doing a fast load here,
 563         * so we can wait for it to finish, otherwise we could end up allocating
  564         * from a block group whose cache gets evicted for one reason or
 565         * another.
 566         */
 567        while (cache->cached == BTRFS_CACHE_FAST) {
 568                struct btrfs_caching_control *ctl;
 569
 570                ctl = cache->caching_ctl;
 571                atomic_inc(&ctl->count);
 572                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 573                spin_unlock(&cache->lock);
 574
 575                schedule();
 576
 577                finish_wait(&ctl->wait, &wait);
 578                put_caching_control(ctl);
 579                spin_lock(&cache->lock);
 580        }
 581
 582        if (cache->cached != BTRFS_CACHE_NO) {
 583                spin_unlock(&cache->lock);
 584                kfree(caching_ctl);
 585                return 0;
 586        }
 587        WARN_ON(cache->caching_ctl);
 588        cache->caching_ctl = caching_ctl;
 589        cache->cached = BTRFS_CACHE_FAST;
 590        spin_unlock(&cache->lock);
 591
 592        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 593                ret = load_free_space_cache(fs_info, cache);
 594
 595                spin_lock(&cache->lock);
 596                if (ret == 1) {
 597                        cache->caching_ctl = NULL;
 598                        cache->cached = BTRFS_CACHE_FINISHED;
 599                        cache->last_byte_to_unpin = (u64)-1;
 600                } else {
 601                        if (load_cache_only) {
 602                                cache->caching_ctl = NULL;
 603                                cache->cached = BTRFS_CACHE_NO;
 604                        } else {
 605                                cache->cached = BTRFS_CACHE_STARTED;
 606                        }
 607                }
 608                spin_unlock(&cache->lock);
 609                wake_up(&caching_ctl->wait);
 610                if (ret == 1) {
 611                        put_caching_control(caching_ctl);
 612                        free_excluded_extents(fs_info->extent_root, cache);
 613                        return 0;
 614                }
 615        } else {
 616                /*
 617                 * We are not going to do the fast caching, set cached to the
  618                 * appropriate value and wake up any waiters.
 619                 */
 620                spin_lock(&cache->lock);
 621                if (load_cache_only) {
 622                        cache->caching_ctl = NULL;
 623                        cache->cached = BTRFS_CACHE_NO;
 624                } else {
 625                        cache->cached = BTRFS_CACHE_STARTED;
 626                }
 627                spin_unlock(&cache->lock);
 628                wake_up(&caching_ctl->wait);
 629        }
 630
 631        if (load_cache_only) {
 632                put_caching_control(caching_ctl);
 633                return 0;
 634        }
 635
 636        down_write(&fs_info->commit_root_sem);
 637        atomic_inc(&caching_ctl->count);
 638        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 639        up_write(&fs_info->commit_root_sem);
 640
 641        btrfs_get_block_group(cache);
 642
 643        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 644
 645        return ret;
 646}
 647
 648/*
 649 * return the block group that starts at or after bytenr
 650 */
 651static struct btrfs_block_group_cache *
 652btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 653{
 654        struct btrfs_block_group_cache *cache;
 655
 656        cache = block_group_cache_tree_search(info, bytenr, 0);
 657
 658        return cache;
 659}
 660
 661/*
 662 * return the block group that contains the given bytenr
 663 */
 664struct btrfs_block_group_cache *btrfs_lookup_block_group(
 665                                                 struct btrfs_fs_info *info,
 666                                                 u64 bytenr)
 667{
 668        struct btrfs_block_group_cache *cache;
 669
 670        cache = block_group_cache_tree_search(info, bytenr, 1);
 671
 672        return cache;
 673}
 674
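/*
 * Find the space_info for the given block group type bits
 * (data, metadata or system).
 */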
 675static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 676                                                  u64 flags)
 677{
 678        struct list_head *head = &info->space_info;
 679        struct btrfs_space_info *found;
 680
 681        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 682
 683        rcu_read_lock();
 684        list_for_each_entry_rcu(found, head, list) {
 685                if (found->flags & flags) {
 686                        rcu_read_unlock();
 687                        return found;
 688                }
 689        }
 690        rcu_read_unlock();
 691        return NULL;
 692}
 693
 694/*
 695 * after adding space to the filesystem, we need to clear the full flags
 696 * on all the space infos.
 697 */
 698void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 699{
 700        struct list_head *head = &info->space_info;
 701        struct btrfs_space_info *found;
 702
 703        rcu_read_lock();
 704        list_for_each_entry_rcu(found, head, list)
 705                found->full = 0;
 706        rcu_read_unlock();
 707}
 708
 709/* simple helper to search for an existing extent at a given offset */
 710int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 711{
 712        int ret;
 713        struct btrfs_key key;
 714        struct btrfs_path *path;
 715
 716        path = btrfs_alloc_path();
 717        if (!path)
 718                return -ENOMEM;
 719
 720        key.objectid = start;
 721        key.offset = len;
 722        key.type = BTRFS_EXTENT_ITEM_KEY;
 723        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 724                                0, 0);
 725        if (ret > 0) {
 726                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 727                if (key.objectid == start &&
 728                    key.type == BTRFS_METADATA_ITEM_KEY)
 729                        ret = 0;
 730        }
 731        btrfs_free_path(path);
 732        return ret;
 733}
 734
 735/*
  736 * Helper function to look up the reference count and flags of a tree block.
  737 *
  738 * The delayed ref head node stores the sum of all the reference count
  739 * modifications queued up in the rbtree, and it may also store the extent
  740 * flags to set.  This lets us check what the reference count and extent
  741 * flags will be once all of the queued delayed refs have been processed,
  742 * without actually running them.
 743 */
 744int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 745                             struct btrfs_root *root, u64 bytenr,
 746                             u64 offset, int metadata, u64 *refs, u64 *flags)
 747{
 748        struct btrfs_delayed_ref_head *head;
 749        struct btrfs_delayed_ref_root *delayed_refs;
 750        struct btrfs_path *path;
 751        struct btrfs_extent_item *ei;
 752        struct extent_buffer *leaf;
 753        struct btrfs_key key;
 754        u32 item_size;
 755        u64 num_refs;
 756        u64 extent_flags;
 757        int ret;
 758
 759        /*
 760         * If we don't have skinny metadata, don't bother doing anything
 761         * different
 762         */
 763        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
 764                offset = root->leafsize;
 765                metadata = 0;
 766        }
 767
 768        path = btrfs_alloc_path();
 769        if (!path)
 770                return -ENOMEM;
 771
 772        if (!trans) {
 773                path->skip_locking = 1;
 774                path->search_commit_root = 1;
 775        }
 776
 777search_again:
 778        key.objectid = bytenr;
 779        key.offset = offset;
 780        if (metadata)
 781                key.type = BTRFS_METADATA_ITEM_KEY;
 782        else
 783                key.type = BTRFS_EXTENT_ITEM_KEY;
 784
 785again:
 786        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 787                                &key, path, 0, 0);
 788        if (ret < 0)
 789                goto out_free;
 790
 791        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 792                if (path->slots[0]) {
 793                        path->slots[0]--;
 794                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 795                                              path->slots[0]);
 796                        if (key.objectid == bytenr &&
 797                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 798                            key.offset == root->leafsize)
 799                                ret = 0;
 800                }
 801                if (ret) {
 802                        key.objectid = bytenr;
 803                        key.type = BTRFS_EXTENT_ITEM_KEY;
 804                        key.offset = root->leafsize;
 805                        btrfs_release_path(path);
 806                        goto again;
 807                }
 808        }
 809
 810        if (ret == 0) {
 811                leaf = path->nodes[0];
 812                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 813                if (item_size >= sizeof(*ei)) {
 814                        ei = btrfs_item_ptr(leaf, path->slots[0],
 815                                            struct btrfs_extent_item);
 816                        num_refs = btrfs_extent_refs(leaf, ei);
 817                        extent_flags = btrfs_extent_flags(leaf, ei);
 818                } else {
 819#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 820                        struct btrfs_extent_item_v0 *ei0;
 821                        BUG_ON(item_size != sizeof(*ei0));
 822                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 823                                             struct btrfs_extent_item_v0);
 824                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 825                        /* FIXME: this isn't correct for data */
 826                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 827#else
 828                        BUG();
 829#endif
 830                }
 831                BUG_ON(num_refs == 0);
 832        } else {
 833                num_refs = 0;
 834                extent_flags = 0;
 835                ret = 0;
 836        }
 837
 838        if (!trans)
 839                goto out;
 840
 841        delayed_refs = &trans->transaction->delayed_refs;
 842        spin_lock(&delayed_refs->lock);
 843        head = btrfs_find_delayed_ref_head(trans, bytenr);
 844        if (head) {
 845                if (!mutex_trylock(&head->mutex)) {
 846                        atomic_inc(&head->node.refs);
 847                        spin_unlock(&delayed_refs->lock);
 848
 849                        btrfs_release_path(path);
 850
 851                        /*
 852                         * Mutex was contended, block until it's released and try
 853                         * again
 854                         */
 855                        mutex_lock(&head->mutex);
 856                        mutex_unlock(&head->mutex);
 857                        btrfs_put_delayed_ref(&head->node);
 858                        goto search_again;
 859                }
 860                spin_lock(&head->lock);
 861                if (head->extent_op && head->extent_op->update_flags)
 862                        extent_flags |= head->extent_op->flags_to_set;
 863                else
 864                        BUG_ON(num_refs == 0);
 865
 866                num_refs += head->node.ref_mod;
 867                spin_unlock(&head->lock);
 868                mutex_unlock(&head->mutex);
 869        }
 870        spin_unlock(&delayed_refs->lock);
 871out:
 872        WARN_ON(num_refs == 0);
 873        if (refs)
 874                *refs = num_refs;
 875        if (flags)
 876                *flags = extent_flags;
 877out_free:
 878        btrfs_free_path(path);
 879        return ret;
 880}
 881
 882/*
 883 * Back reference rules.  Back refs have three main goals:
 884 *
 885 * 1) differentiate between all holders of references to an extent so that
 886 *    when a reference is dropped we can make sure it was a valid reference
 887 *    before freeing the extent.
 888 *
 889 * 2) Provide enough information to quickly find the holders of an extent
 890 *    if we notice a given block is corrupted or bad.
 891 *
 892 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 893 *    maintenance.  This is actually the same as #2, but with a slightly
 894 *    different use case.
 895 *
  896 * There are two kinds of back refs. The implicit back ref is optimized
  897 * for pointers in non-shared tree blocks. For a given pointer in a block,
  898 * back refs of this kind provide information about the block's owner tree
  899 * and the pointer's key. This information allows us to find the block by
  900 * b-tree searching. The full back ref is for pointers in tree blocks not
  901 * referenced by their owner trees. The location of the tree block is
  902 * recorded in the back ref. The full back ref is actually generic and can
  903 * be used in all the cases where an implicit back ref is used. Its major
  904 * shortcoming is the overhead: every time a tree block gets COWed, we have
  905 * to update the back ref entries for all pointers in it.
  906 *
  907 * For a newly allocated tree block, we use implicit back refs for the
  908 * pointers in it. This means most tree related operations only involve
  909 * implicit back refs. For a tree block created in an old transaction, the
  910 * only way to drop a reference to it is to COW it. That lets us detect
  911 * the event that a tree block loses its owner tree's reference and do the
  912 * back ref conversion.
 913 *
 914 * When a tree block is COW'd through a tree, there are four cases:
 915 *
 916 * The reference count of the block is one and the tree is the block's
 917 * owner tree. Nothing to do in this case.
 918 *
  919 * The reference count of the block is one and the tree is not the
  920 * block's owner tree. In this case, full back refs are used for the
  921 * pointers in the block. Remove these full back refs and add implicit
  922 * back refs for every pointer in the new block.
  923 *
  924 * The reference count of the block is greater than one and the tree is
  925 * the block's owner tree. In this case, implicit back refs are used for
  926 * the pointers in the block. Add full back refs for every pointer in the
  927 * block and increase the lower level extents' reference counts. The
  928 * original implicit back refs are carried over to the new block.
  929 *
  930 * The reference count of the block is greater than one and the tree is
  931 * not the block's owner tree. Add implicit back refs for every pointer in
  932 * the new block and increase the lower level extents' reference counts.
 933 *
 934 * Back Reference Key composing:
 935 *
 936 * The key objectid corresponds to the first byte in the extent,
 937 * The key type is used to differentiate between types of back refs.
 938 * There are different meanings of the key offset for different types
 939 * of back refs.
 940 *
 941 * File extents can be referenced by:
 942 *
 943 * - multiple snapshots, subvolumes, or different generations in one subvol
 944 * - different files inside a single subvolume
 945 * - different offsets inside a file (bookend extents in file.c)
 946 *
 947 * The extent ref structure for the implicit back refs has fields for:
 948 *
 949 * - Objectid of the subvolume root
 950 * - objectid of the file holding the reference
 951 * - original offset in the file
 952 * - how many bookend extents
 953 *
  954 * The key offset for the implicit back refs is a hash of the first
  955 * three fields.
  956 *
  957 * The extent ref structure for the full back refs has a field for:
  958 *
  959 * - number of pointers in the tree leaf
  960 *
  961 * The key offset for the full back refs is the first byte of
  962 * the tree leaf.
  963 *
  964 * When a file extent is allocated, the implicit back refs are used
  965 * and the fields are filled in:
 966 *
 967 *     (root_key.objectid, inode objectid, offset in file, 1)
 968 *
  969 * When a file extent is removed by file truncation, we find the
 970 * corresponding implicit back refs and check the following fields:
 971 *
 972 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 973 *
 974 * Btree extents can be referenced by:
 975 *
 976 * - Different subvolumes
 977 *
  978 * Both the implicit back refs and the full back refs for tree blocks
  979 * consist only of a key. The key offset for the implicit back refs is the
  980 * objectid of the block's owner tree. The key offset for the full back
  981 * refs is the first byte of the parent block.
 982 *
  983 * When implicit back refs are used, information about the lowest key and
  984 * the level of the tree block is required. This information is stored in
  985 * the tree block info structure.
 986 */
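/*
 * As an illustration of the key layout above (the values are only examples):
 * a file extent at logical byte X referenced by inode 257 at file offset 0
 * in the subvolume tree with objectid 5 gets an implicit back ref keyed
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent referenced through a shared leaf at byte Y gets a
 * full back ref keyed (X, BTRFS_SHARED_DATA_REF_KEY, Y).
 */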
 987
 988#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 989static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 990                                  struct btrfs_root *root,
 991                                  struct btrfs_path *path,
 992                                  u64 owner, u32 extra_size)
 993{
 994        struct btrfs_extent_item *item;
 995        struct btrfs_extent_item_v0 *ei0;
 996        struct btrfs_extent_ref_v0 *ref0;
 997        struct btrfs_tree_block_info *bi;
 998        struct extent_buffer *leaf;
 999        struct btrfs_key key;
1000        struct btrfs_key found_key;
1001        u32 new_size = sizeof(*item);
1002        u64 refs;
1003        int ret;
1004
1005        leaf = path->nodes[0];
1006        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1007
1008        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1009        ei0 = btrfs_item_ptr(leaf, path->slots[0],
1010                             struct btrfs_extent_item_v0);
1011        refs = btrfs_extent_refs_v0(leaf, ei0);
1012
1013        if (owner == (u64)-1) {
1014                while (1) {
1015                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1016                                ret = btrfs_next_leaf(root, path);
1017                                if (ret < 0)
1018                                        return ret;
1019                                BUG_ON(ret > 0); /* Corruption */
1020                                leaf = path->nodes[0];
1021                        }
1022                        btrfs_item_key_to_cpu(leaf, &found_key,
1023                                              path->slots[0]);
1024                        BUG_ON(key.objectid != found_key.objectid);
1025                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1026                                path->slots[0]++;
1027                                continue;
1028                        }
1029                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1030                                              struct btrfs_extent_ref_v0);
1031                        owner = btrfs_ref_objectid_v0(leaf, ref0);
1032                        break;
1033                }
1034        }
1035        btrfs_release_path(path);
1036
1037        if (owner < BTRFS_FIRST_FREE_OBJECTID)
1038                new_size += sizeof(*bi);
1039
1040        new_size -= sizeof(*ei0);
1041        ret = btrfs_search_slot(trans, root, &key, path,
1042                                new_size + extra_size, 1);
1043        if (ret < 0)
1044                return ret;
1045        BUG_ON(ret); /* Corruption */
1046
1047        btrfs_extend_item(root, path, new_size);
1048
1049        leaf = path->nodes[0];
1050        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1051        btrfs_set_extent_refs(leaf, item, refs);
1052        /* FIXME: get real generation */
1053        btrfs_set_extent_generation(leaf, item, 0);
1054        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1055                btrfs_set_extent_flags(leaf, item,
1056                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1057                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1058                bi = (struct btrfs_tree_block_info *)(item + 1);
1059                /* FIXME: get first key of the block */
1060                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1061                btrfs_set_tree_block_level(leaf, bi, (int)owner);
1062        } else {
1063                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1064        }
1065        btrfs_mark_buffer_dirty(leaf);
1066        return 0;
1067}
1068#endif
1069
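/*
 * crc32c based hash of (root objectid, inode objectid, file offset), used as
 * the key offset for implicit data back refs.
 */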
1070static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1071{
1072        u32 high_crc = ~(u32)0;
1073        u32 low_crc = ~(u32)0;
1074        __le64 lenum;
1075
1076        lenum = cpu_to_le64(root_objectid);
1077        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1078        lenum = cpu_to_le64(owner);
1079        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1080        lenum = cpu_to_le64(offset);
1081        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1082
1083        return ((u64)high_crc << 31) ^ (u64)low_crc;
1084}
1085
1086static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1087                                     struct btrfs_extent_data_ref *ref)
1088{
1089        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1090                                    btrfs_extent_data_ref_objectid(leaf, ref),
1091                                    btrfs_extent_data_ref_offset(leaf, ref));
1092}
1093
1094static int match_extent_data_ref(struct extent_buffer *leaf,
1095                                 struct btrfs_extent_data_ref *ref,
1096                                 u64 root_objectid, u64 owner, u64 offset)
1097{
1098        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1099            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1100            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1101                return 0;
1102        return 1;
1103}
1104
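/*
 * Look up the data ref item for an extent.  With a parent block this is a
 * plain search for the SHARED_DATA_REF item; otherwise we look for the
 * EXTENT_DATA_REF item at the hashed key offset and walk forward over hash
 * collisions until the (root, owner, offset) triple matches.
 */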
1105static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1106                                           struct btrfs_root *root,
1107                                           struct btrfs_path *path,
1108                                           u64 bytenr, u64 parent,
1109                                           u64 root_objectid,
1110                                           u64 owner, u64 offset)
1111{
1112        struct btrfs_key key;
1113        struct btrfs_extent_data_ref *ref;
1114        struct extent_buffer *leaf;
1115        u32 nritems;
1116        int ret;
1117        int recow;
1118        int err = -ENOENT;
1119
1120        key.objectid = bytenr;
1121        if (parent) {
1122                key.type = BTRFS_SHARED_DATA_REF_KEY;
1123                key.offset = parent;
1124        } else {
1125                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1126                key.offset = hash_extent_data_ref(root_objectid,
1127                                                  owner, offset);
1128        }
1129again:
1130        recow = 0;
1131        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1132        if (ret < 0) {
1133                err = ret;
1134                goto fail;
1135        }
1136
1137        if (parent) {
1138                if (!ret)
1139                        return 0;
1140#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1141                key.type = BTRFS_EXTENT_REF_V0_KEY;
1142                btrfs_release_path(path);
1143                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1144                if (ret < 0) {
1145                        err = ret;
1146                        goto fail;
1147                }
1148                if (!ret)
1149                        return 0;
1150#endif
1151                goto fail;
1152        }
1153
1154        leaf = path->nodes[0];
1155        nritems = btrfs_header_nritems(leaf);
1156        while (1) {
1157                if (path->slots[0] >= nritems) {
1158                        ret = btrfs_next_leaf(root, path);
1159                        if (ret < 0)
1160                                err = ret;
1161                        if (ret)
1162                                goto fail;
1163
1164                        leaf = path->nodes[0];
1165                        nritems = btrfs_header_nritems(leaf);
1166                        recow = 1;
1167                }
1168
1169                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1170                if (key.objectid != bytenr ||
1171                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1172                        goto fail;
1173
1174                ref = btrfs_item_ptr(leaf, path->slots[0],
1175                                     struct btrfs_extent_data_ref);
1176
1177                if (match_extent_data_ref(leaf, ref, root_objectid,
1178                                          owner, offset)) {
1179                        if (recow) {
1180                                btrfs_release_path(path);
1181                                goto again;
1182                        }
1183                        err = 0;
1184                        break;
1185                }
1186                path->slots[0]++;
1187        }
1188fail:
1189        return err;
1190}
1191
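/*
 * Insert a data ref item, or bump the count of an existing one.  For implicit
 * back refs a hash collision with a different (root, owner, offset) triple is
 * resolved by incrementing the key offset and retrying until a free slot or
 * the matching item is found.
 */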
1192static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1193                                           struct btrfs_root *root,
1194                                           struct btrfs_path *path,
1195                                           u64 bytenr, u64 parent,
1196                                           u64 root_objectid, u64 owner,
1197                                           u64 offset, int refs_to_add)
1198{
1199        struct btrfs_key key;
1200        struct extent_buffer *leaf;
1201        u32 size;
1202        u32 num_refs;
1203        int ret;
1204
1205        key.objectid = bytenr;
1206        if (parent) {
1207                key.type = BTRFS_SHARED_DATA_REF_KEY;
1208                key.offset = parent;
1209                size = sizeof(struct btrfs_shared_data_ref);
1210        } else {
1211                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1212                key.offset = hash_extent_data_ref(root_objectid,
1213                                                  owner, offset);
1214                size = sizeof(struct btrfs_extent_data_ref);
1215        }
1216
1217        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1218        if (ret && ret != -EEXIST)
1219                goto fail;
1220
1221        leaf = path->nodes[0];
1222        if (parent) {
1223                struct btrfs_shared_data_ref *ref;
1224                ref = btrfs_item_ptr(leaf, path->slots[0],
1225                                     struct btrfs_shared_data_ref);
1226                if (ret == 0) {
1227                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1228                } else {
1229                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1230                        num_refs += refs_to_add;
1231                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1232                }
1233        } else {
1234                struct btrfs_extent_data_ref *ref;
1235                while (ret == -EEXIST) {
1236                        ref = btrfs_item_ptr(leaf, path->slots[0],
1237                                             struct btrfs_extent_data_ref);
1238                        if (match_extent_data_ref(leaf, ref, root_objectid,
1239                                                  owner, offset))
1240                                break;
1241                        btrfs_release_path(path);
1242                        key.offset++;
1243                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1244                                                      size);
1245                        if (ret && ret != -EEXIST)
1246                                goto fail;
1247
1248                        leaf = path->nodes[0];
1249                }
1250                ref = btrfs_item_ptr(leaf, path->slots[0],
1251                                     struct btrfs_extent_data_ref);
1252                if (ret == 0) {
1253                        btrfs_set_extent_data_ref_root(leaf, ref,
1254                                                       root_objectid);
1255                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1256                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1257                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1258                } else {
1259                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1260                        num_refs += refs_to_add;
1261                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1262                }
1263        }
1264        btrfs_mark_buffer_dirty(leaf);
1265        ret = 0;
1266fail:
1267        btrfs_release_path(path);
1268        return ret;
1269}
1270
1271static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1272                                           struct btrfs_root *root,
1273                                           struct btrfs_path *path,
1274                                           int refs_to_drop)
1275{
1276        struct btrfs_key key;
1277        struct btrfs_extent_data_ref *ref1 = NULL;
1278        struct btrfs_shared_data_ref *ref2 = NULL;
1279        struct extent_buffer *leaf;
1280        u32 num_refs = 0;
1281        int ret = 0;
1282
1283        leaf = path->nodes[0];
1284        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1285
1286        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1287                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1288                                      struct btrfs_extent_data_ref);
1289                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1290        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1291                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1292                                      struct btrfs_shared_data_ref);
1293                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1294#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1295        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1296                struct btrfs_extent_ref_v0 *ref0;
1297                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1298                                      struct btrfs_extent_ref_v0);
1299                num_refs = btrfs_ref_count_v0(leaf, ref0);
1300#endif
1301        } else {
1302                BUG();
1303        }
1304
1305        BUG_ON(num_refs < refs_to_drop);
1306        num_refs -= refs_to_drop;
1307
1308        if (num_refs == 0) {
1309                ret = btrfs_del_item(trans, root, path);
1310        } else {
1311                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1312                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1313                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1314                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1315#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1316                else {
1317                        struct btrfs_extent_ref_v0 *ref0;
1318                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1319                                        struct btrfs_extent_ref_v0);
1320                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1321                }
1322#endif
1323                btrfs_mark_buffer_dirty(leaf);
1324        }
1325        return ret;
1326}
1327
1328static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1329                                          struct btrfs_path *path,
1330                                          struct btrfs_extent_inline_ref *iref)
1331{
1332        struct btrfs_key key;
1333        struct extent_buffer *leaf;
1334        struct btrfs_extent_data_ref *ref1;
1335        struct btrfs_shared_data_ref *ref2;
1336        u32 num_refs = 0;
1337
1338        leaf = path->nodes[0];
1339        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1340        if (iref) {
1341                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1342                    BTRFS_EXTENT_DATA_REF_KEY) {
1343                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1344                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1345                } else {
1346                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1347                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1348                }
1349        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1350                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1351                                      struct btrfs_extent_data_ref);
1352                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1353        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1354                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1355                                      struct btrfs_shared_data_ref);
1356                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1357#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1358        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1359                struct btrfs_extent_ref_v0 *ref0;
1360                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1361                                      struct btrfs_extent_ref_v0);
1362                num_refs = btrfs_ref_count_v0(leaf, ref0);
1363#endif
1364        } else {
1365                WARN_ON(1);
1366        }
1367        return num_refs;
1368}
1369
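/*
 * Look up the keyed backref item for a tree block: a SHARED_BLOCK_REF
 * keyed on @parent when the block is shared, otherwise a TREE_BLOCK_REF
 * keyed on the owning root.  Returns 0 if the item exists and -ENOENT
 * if it does not.
 */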
1370static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1371                                          struct btrfs_root *root,
1372                                          struct btrfs_path *path,
1373                                          u64 bytenr, u64 parent,
1374                                          u64 root_objectid)
1375{
1376        struct btrfs_key key;
1377        int ret;
1378
1379        key.objectid = bytenr;
1380        if (parent) {
1381                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1382                key.offset = parent;
1383        } else {
1384                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1385                key.offset = root_objectid;
1386        }
1387
1388        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1389        if (ret > 0)
1390                ret = -ENOENT;
1391#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1392        if (ret == -ENOENT && parent) {
1393                btrfs_release_path(path);
1394                key.type = BTRFS_EXTENT_REF_V0_KEY;
1395                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1396                if (ret > 0)
1397                        ret = -ENOENT;
1398        }
1399#endif
1400        return ret;
1401}
1402
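/*
 * Insert the keyed backref item for a tree block.  The key mirrors
 * lookup_tree_block_ref() and the item body is empty: for tree blocks
 * the key itself is the whole reference.
 */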
1403static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1404                                          struct btrfs_root *root,
1405                                          struct btrfs_path *path,
1406                                          u64 bytenr, u64 parent,
1407                                          u64 root_objectid)
1408{
1409        struct btrfs_key key;
1410        int ret;
1411
1412        key.objectid = bytenr;
1413        if (parent) {
1414                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1415                key.offset = parent;
1416        } else {
1417                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1418                key.offset = root_objectid;
1419        }
1420
1421        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1422        btrfs_release_path(path);
1423        return ret;
1424}
1425
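/*
 * Map (parent, owner) to the backref type used for an extent:
 *
 *   tree block (owner < BTRFS_FIRST_FREE_OBJECTID), parent set -> SHARED_BLOCK_REF
 *   tree block,                                     no parent  -> TREE_BLOCK_REF
 *   data extent,                                    parent set -> SHARED_DATA_REF
 *   data extent,                                    no parent  -> EXTENT_DATA_REF
 */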
1426static inline int extent_ref_type(u64 parent, u64 owner)
1427{
1428        int type;
1429        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1430                if (parent > 0)
1431                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1432                else
1433                        type = BTRFS_TREE_BLOCK_REF_KEY;
1434        } else {
1435                if (parent > 0)
1436                        type = BTRFS_SHARED_DATA_REF_KEY;
1437                else
1438                        type = BTRFS_EXTENT_DATA_REF_KEY;
1439        }
1440        return type;
1441}
1442
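/*
 * Return (in @key) the key immediately after the current path position,
 * walking up from @level until a node with a following slot is found.
 * Returns 0 on success, 1 if the path already points at the last key.
 */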
1443static int find_next_key(struct btrfs_path *path, int level,
1444                         struct btrfs_key *key)
1445
1446{
1447        for (; level < BTRFS_MAX_LEVEL; level++) {
1448                if (!path->nodes[level])
1449                        break;
1450                if (path->slots[level] + 1 >=
1451                    btrfs_header_nritems(path->nodes[level]))
1452                        continue;
1453                if (level == 0)
1454                        btrfs_item_key_to_cpu(path->nodes[level], key,
1455                                              path->slots[level] + 1);
1456                else
1457                        btrfs_node_key_to_cpu(path->nodes[level], key,
1458                                              path->slots[level] + 1);
1459                return 0;
1460        }
1461        return 1;
1462}
1463
1464/*
1465 * look for inline back ref. if back ref is found, *ref_ret is set
1466 * to the address of inline back ref, and 0 is returned.
1467 *
1468 * if back ref isn't found, *ref_ret is set to the address where it
1469 * should be inserted, and -ENOENT is returned.
1470 *
1471 * if insert is true and there are too many inline back refs, the path
1472 * points to the extent item, and -EAGAIN is returned.
1473 *
1474 * NOTE: inline back refs are ordered in the same way that back ref
1475 *       items in the tree are ordered.
1476 */
1477static noinline_for_stack
1478int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1479                                 struct btrfs_root *root,
1480                                 struct btrfs_path *path,
1481                                 struct btrfs_extent_inline_ref **ref_ret,
1482                                 u64 bytenr, u64 num_bytes,
1483                                 u64 parent, u64 root_objectid,
1484                                 u64 owner, u64 offset, int insert)
1485{
1486        struct btrfs_key key;
1487        struct extent_buffer *leaf;
1488        struct btrfs_extent_item *ei;
1489        struct btrfs_extent_inline_ref *iref;
1490        u64 flags;
1491        u64 item_size;
1492        unsigned long ptr;
1493        unsigned long end;
1494        int extra_size;
1495        int type;
1496        int want;
1497        int ret;
1498        int err = 0;
1499        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1500                                                 SKINNY_METADATA);
1501
1502        key.objectid = bytenr;
1503        key.type = BTRFS_EXTENT_ITEM_KEY;
1504        key.offset = num_bytes;
1505
1506        want = extent_ref_type(parent, owner);
1507        if (insert) {
1508                extra_size = btrfs_extent_inline_ref_size(want);
1509                path->keep_locks = 1;
1510        } else
1511                extra_size = -1;
1512
1513        /*
1514         * Owner is our parent level, so we can just add one to get the level
1515         * for the block we are interested in.
1516         */
1517        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1518                key.type = BTRFS_METADATA_ITEM_KEY;
1519                key.offset = owner;
1520        }
1521
1522again:
1523        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1524        if (ret < 0) {
1525                err = ret;
1526                goto out;
1527        }
1528
1529        /*
1530         * We may be a newly converted file system which still has the old fat
1531         * extent entries for metadata, so try and see if we have one of those.
1532         */
1533        if (ret > 0 && skinny_metadata) {
1534                skinny_metadata = false;
1535                if (path->slots[0]) {
1536                        path->slots[0]--;
1537                        btrfs_item_key_to_cpu(path->nodes[0], &key,
1538                                              path->slots[0]);
1539                        if (key.objectid == bytenr &&
1540                            key.type == BTRFS_EXTENT_ITEM_KEY &&
1541                            key.offset == num_bytes)
1542                                ret = 0;
1543                }
1544                if (ret) {
1545                        key.objectid = bytenr;
1546                        key.type = BTRFS_EXTENT_ITEM_KEY;
1547                        key.offset = num_bytes;
1548                        btrfs_release_path(path);
1549                        goto again;
1550                }
1551        }
1552
1553        if (ret && !insert) {
1554                err = -ENOENT;
1555                goto out;
1556        } else if (WARN_ON(ret)) {
1557                err = -EIO;
1558                goto out;
1559        }
1560
1561        leaf = path->nodes[0];
1562        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1563#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1564        if (item_size < sizeof(*ei)) {
1565                if (!insert) {
1566                        err = -ENOENT;
1567                        goto out;
1568                }
1569                ret = convert_extent_item_v0(trans, root, path, owner,
1570                                             extra_size);
1571                if (ret < 0) {
1572                        err = ret;
1573                        goto out;
1574                }
1575                leaf = path->nodes[0];
1576                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1577        }
1578#endif
1579        BUG_ON(item_size < sizeof(*ei));
1580
1581        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1582        flags = btrfs_extent_flags(leaf, ei);
1583
1584        ptr = (unsigned long)(ei + 1);
1585        end = (unsigned long)ei + item_size;
1586
1587        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1588                ptr += sizeof(struct btrfs_tree_block_info);
1589                BUG_ON(ptr > end);
1590        }
1591
1592        err = -ENOENT;
1593        while (1) {
1594                if (ptr >= end) {
1595                        WARN_ON(ptr > end);
1596                        break;
1597                }
1598                iref = (struct btrfs_extent_inline_ref *)ptr;
1599                type = btrfs_extent_inline_ref_type(leaf, iref);
1600                if (want < type)
1601                        break;
1602                if (want > type) {
1603                        ptr += btrfs_extent_inline_ref_size(type);
1604                        continue;
1605                }
1606
1607                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1608                        struct btrfs_extent_data_ref *dref;
1609                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1610                        if (match_extent_data_ref(leaf, dref, root_objectid,
1611                                                  owner, offset)) {
1612                                err = 0;
1613                                break;
1614                        }
1615                        if (hash_extent_data_ref_item(leaf, dref) <
1616                            hash_extent_data_ref(root_objectid, owner, offset))
1617                                break;
1618                } else {
1619                        u64 ref_offset;
1620                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1621                        if (parent > 0) {
1622                                if (parent == ref_offset) {
1623                                        err = 0;
1624                                        break;
1625                                }
1626                                if (ref_offset < parent)
1627                                        break;
1628                        } else {
1629                                if (root_objectid == ref_offset) {
1630                                        err = 0;
1631                                        break;
1632                                }
1633                                if (ref_offset < root_objectid)
1634                                        break;
1635                        }
1636                }
1637                ptr += btrfs_extent_inline_ref_size(type);
1638        }
1639        if (err == -ENOENT && insert) {
1640                if (item_size + extra_size >=
1641                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1642                        err = -EAGAIN;
1643                        goto out;
1644                }
1645                /*
1646                 * To add new inline back ref, we have to make sure
1647                 * there is no corresponding back ref item.
1648                 * For simplicity, we just do not add new inline back
1649                 * ref if there is any kind of item for this block
1650                 */
1651                if (find_next_key(path, 0, &key) == 0 &&
1652                    key.objectid == bytenr &&
1653                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1654                        err = -EAGAIN;
1655                        goto out;
1656                }
1657        }
1658        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1659out:
1660        if (insert) {
1661                path->keep_locks = 0;
1662                btrfs_unlock_up_safe(path, 1);
1663        }
1664        return err;
1665}
1666
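/*
 * Illustrative sketch (not part of the original file) of how the callers
 * below consume that contract; compare insert_inline_extent_backref():
 *
 *	ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
 *					   num_bytes, parent, root_objectid,
 *					   owner, offset, 1);
 *	if (ret == 0)			// found: bump the existing inline ref
 *		update_inline_extent_backref(root, path, iref,
 *					     refs_to_add, extent_op);
 *	else if (ret == -ENOENT)	// iref points at the insertion spot
 *		setup_inline_extent_backref(root, path, iref, parent,
 *					    root_objectid, owner, offset,
 *					    refs_to_add, extent_op);
 *	// -EAGAIN: no room inline, fall back to a keyed backref item
 */
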
1667/*
1668 * helper to add new inline back ref
1669 */
1670static noinline_for_stack
1671void setup_inline_extent_backref(struct btrfs_root *root,
1672                                 struct btrfs_path *path,
1673                                 struct btrfs_extent_inline_ref *iref,
1674                                 u64 parent, u64 root_objectid,
1675                                 u64 owner, u64 offset, int refs_to_add,
1676                                 struct btrfs_delayed_extent_op *extent_op)
1677{
1678        struct extent_buffer *leaf;
1679        struct btrfs_extent_item *ei;
1680        unsigned long ptr;
1681        unsigned long end;
1682        unsigned long item_offset;
1683        u64 refs;
1684        int size;
1685        int type;
1686
1687        leaf = path->nodes[0];
1688        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1689        item_offset = (unsigned long)iref - (unsigned long)ei;
1690
1691        type = extent_ref_type(parent, owner);
1692        size = btrfs_extent_inline_ref_size(type);
1693
1694        btrfs_extend_item(root, path, size);
1695
1696        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1697        refs = btrfs_extent_refs(leaf, ei);
1698        refs += refs_to_add;
1699        btrfs_set_extent_refs(leaf, ei, refs);
1700        if (extent_op)
1701                __run_delayed_extent_op(extent_op, leaf, ei);
1702
1703        ptr = (unsigned long)ei + item_offset;
1704        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1705        if (ptr < end - size)
1706                memmove_extent_buffer(leaf, ptr + size, ptr,
1707                                      end - size - ptr);
1708
1709        iref = (struct btrfs_extent_inline_ref *)ptr;
1710        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1711        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1712                struct btrfs_extent_data_ref *dref;
1713                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1714                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1715                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1716                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1717                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1718        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1719                struct btrfs_shared_data_ref *sref;
1720                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1721                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1722                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1723        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1724                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1725        } else {
1726                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1727        }
1728        btrfs_mark_buffer_dirty(leaf);
1729}
1730
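/*
 * Look up a backref in either of its forms: first as an inline ref inside
 * the extent item; if that returns -ENOENT, as a separate keyed backref
 * item (tree block or data ref depending on @owner).  *ref_ret is only
 * set for the inline case.
 */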
1731static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1732                                 struct btrfs_root *root,
1733                                 struct btrfs_path *path,
1734                                 struct btrfs_extent_inline_ref **ref_ret,
1735                                 u64 bytenr, u64 num_bytes, u64 parent,
1736                                 u64 root_objectid, u64 owner, u64 offset)
1737{
1738        int ret;
1739
1740        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1741                                           bytenr, num_bytes, parent,
1742                                           root_objectid, owner, offset, 0);
1743        if (ret != -ENOENT)
1744                return ret;
1745
1746        btrfs_release_path(path);
1747        *ref_ret = NULL;
1748
1749        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1750                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1751                                            root_objectid);
1752        } else {
1753                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1754                                             root_objectid, owner, offset);
1755        }
1756        return ret;
1757}
1758
1759/*
1760 * helper to update/remove inline back ref
1761 */
1762static noinline_for_stack
1763void update_inline_extent_backref(struct btrfs_root *root,
1764                                  struct btrfs_path *path,
1765                                  struct btrfs_extent_inline_ref *iref,
1766                                  int refs_to_mod,
1767                                  struct btrfs_delayed_extent_op *extent_op)
1768{
1769        struct extent_buffer *leaf;
1770        struct btrfs_extent_item *ei;
1771        struct btrfs_extent_data_ref *dref = NULL;
1772        struct btrfs_shared_data_ref *sref = NULL;
1773        unsigned long ptr;
1774        unsigned long end;
1775        u32 item_size;
1776        int size;
1777        int type;
1778        u64 refs;
1779
1780        leaf = path->nodes[0];
1781        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1782        refs = btrfs_extent_refs(leaf, ei);
1783        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1784        refs += refs_to_mod;
1785        btrfs_set_extent_refs(leaf, ei, refs);
1786        if (extent_op)
1787                __run_delayed_extent_op(extent_op, leaf, ei);
1788
1789        type = btrfs_extent_inline_ref_type(leaf, iref);
1790
1791        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1792                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1793                refs = btrfs_extent_data_ref_count(leaf, dref);
1794        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1795                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1796                refs = btrfs_shared_data_ref_count(leaf, sref);
1797        } else {
1798                refs = 1;
1799                BUG_ON(refs_to_mod != -1);
1800        }
1801
1802        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1803        refs += refs_to_mod;
1804
1805        if (refs > 0) {
1806                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1807                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1808                else
1809                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1810        } else {
1811                size =  btrfs_extent_inline_ref_size(type);
1812                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1813                ptr = (unsigned long)iref;
1814                end = (unsigned long)ei + item_size;
1815                if (ptr + size < end)
1816                        memmove_extent_buffer(leaf, ptr, ptr + size,
1817                                              end - ptr - size);
1818                item_size -= size;
1819                btrfs_truncate_item(root, path, item_size, 1);
1820        }
1821        btrfs_mark_buffer_dirty(leaf);
1822}
1823
1824static noinline_for_stack
1825int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1826                                 struct btrfs_root *root,
1827                                 struct btrfs_path *path,
1828                                 u64 bytenr, u64 num_bytes, u64 parent,
1829                                 u64 root_objectid, u64 owner,
1830                                 u64 offset, int refs_to_add,
1831                                 struct btrfs_delayed_extent_op *extent_op)
1832{
1833        struct btrfs_extent_inline_ref *iref;
1834        int ret;
1835
1836        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1837                                           bytenr, num_bytes, parent,
1838                                           root_objectid, owner, offset, 1);
1839        if (ret == 0) {
1840                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1841                update_inline_extent_backref(root, path, iref,
1842                                             refs_to_add, extent_op);
1843        } else if (ret == -ENOENT) {
1844                setup_inline_extent_backref(root, path, iref, parent,
1845                                            root_objectid, owner, offset,
1846                                            refs_to_add, extent_op);
1847                ret = 0;
1848        }
1849        return ret;
1850}
1851
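/*
 * Insert a keyed (non-inline) backref item.  A tree block only ever adds
 * a single ref per root or parent (hence the BUG_ON below), while data
 * backrefs carry a count and go through insert_extent_data_ref().
 */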
1852static int insert_extent_backref(struct btrfs_trans_handle *trans,
1853                                 struct btrfs_root *root,
1854                                 struct btrfs_path *path,
1855                                 u64 bytenr, u64 parent, u64 root_objectid,
1856                                 u64 owner, u64 offset, int refs_to_add)
1857{
1858        int ret;
1859        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1860                BUG_ON(refs_to_add != 1);
1861                ret = insert_tree_block_ref(trans, root, path, bytenr,
1862                                            parent, root_objectid);
1863        } else {
1864                ret = insert_extent_data_ref(trans, root, path, bytenr,
1865                                             parent, root_objectid,
1866                                             owner, offset, refs_to_add);
1867        }
1868        return ret;
1869}
1870
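/*
 * Drop @refs_to_drop references from one backref: inline refs are updated
 * (or removed) in place, keyed data refs go through remove_extent_data_ref(),
 * and keyed tree block refs are deleted outright since they carry no count.
 */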
1871static int remove_extent_backref(struct btrfs_trans_handle *trans,
1872                                 struct btrfs_root *root,
1873                                 struct btrfs_path *path,
1874                                 struct btrfs_extent_inline_ref *iref,
1875                                 int refs_to_drop, int is_data)
1876{
1877        int ret = 0;
1878
1879        BUG_ON(!is_data && refs_to_drop != 1);
1880        if (iref) {
1881                update_inline_extent_backref(root, path, iref,
1882                                             -refs_to_drop, NULL);
1883        } else if (is_data) {
1884                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1885        } else {
1886                ret = btrfs_del_item(trans, root, path);
1887        }
1888        return ret;
1889}
1890
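/*
 * blkdev_issue_discard() takes its range in 512-byte sectors, hence the
 * shifts by 9 below (e.g. a 1MiB range at offset 0 becomes
 * 1048576 >> 9 == 2048 sectors).
 */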
1891static int btrfs_issue_discard(struct block_device *bdev,
1892                                u64 start, u64 len)
1893{
1894        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1895}
1896
1897static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1898                                u64 num_bytes, u64 *actual_bytes)
1899{
1900        int ret;
1901        u64 discarded_bytes = 0;
1902        struct btrfs_bio *bbio = NULL;
1903
1904
1905        /* Tell the block device(s) that the sectors can be discarded */
1906        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1907                              bytenr, &num_bytes, &bbio, 0);
1908        /* Error condition is -ENOMEM */
1909        if (!ret) {
1910                struct btrfs_bio_stripe *stripe = bbio->stripes;
1911                int i;
1912
1913
1914                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1915                        if (!stripe->dev->can_discard)
1916                                continue;
1917
1918                        ret = btrfs_issue_discard(stripe->dev->bdev,
1919                                                  stripe->physical,
1920                                                  stripe->length);
1921                        if (!ret)
1922                                discarded_bytes += stripe->length;
1923                        else if (ret != -EOPNOTSUPP)
1924                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1925
1926                        /*
1927                         * Just in case we get back EOPNOTSUPP for some reason,
1928                         * just ignore the return value so we don't screw up
1929                         * people calling discard_extent.
1930                         */
1931                        ret = 0;
1932                }
1933                kfree(bbio);
1934        }
1935
1936        if (actual_bytes)
1937                *actual_bytes = discarded_bytes;
1938
1939
1940        if (ret == -EOPNOTSUPP)
1941                ret = 0;
1942        return ret;
1943}
1944
1945/* Can return -ENOMEM */
1946int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1947                         struct btrfs_root *root,
1948                         u64 bytenr, u64 num_bytes, u64 parent,
1949                         u64 root_objectid, u64 owner, u64 offset, int for_cow)
1950{
1951        int ret;
1952        struct btrfs_fs_info *fs_info = root->fs_info;
1953
1954        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1955               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1956
1957        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1958                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1959                                        num_bytes,
1960                                        parent, root_objectid, (int)owner,
1961                                        BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1962        } else {
1963                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1964                                        num_bytes,
1965                                        parent, root_objectid, owner, offset,
1966                                        BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1967        }
1968        return ret;
1969}
1970
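/*
 * Add @refs_to_add references to an already allocated extent.  The inline
 * form is tried first; if insert_inline_extent_backref() returns -EAGAIN
 * there is no room left in the extent item, so the ref count on the item
 * is bumped here and a separate keyed backref item is inserted instead.
 */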
1971static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1972                                  struct btrfs_root *root,
1973                                  u64 bytenr, u64 num_bytes,
1974                                  u64 parent, u64 root_objectid,
1975                                  u64 owner, u64 offset, int refs_to_add,
1976                                  struct btrfs_delayed_extent_op *extent_op)
1977{
1978        struct btrfs_path *path;
1979        struct extent_buffer *leaf;
1980        struct btrfs_extent_item *item;
1981        u64 refs;
1982        int ret;
1983
1984        path = btrfs_alloc_path();
1985        if (!path)
1986                return -ENOMEM;
1987
1988        path->reada = 1;
1989        path->leave_spinning = 1;
1990        /* this will set up the path even if it fails to insert the back ref */
1991        ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1992                                           path, bytenr, num_bytes, parent,
1993                                           root_objectid, owner, offset,
1994                                           refs_to_add, extent_op);
1995        if (ret != -EAGAIN)
1996                goto out;
1997
1998        leaf = path->nodes[0];
1999        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2000        refs = btrfs_extent_refs(leaf, item);
2001        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2002        if (extent_op)
2003                __run_delayed_extent_op(extent_op, leaf, item);
2004
2005        btrfs_mark_buffer_dirty(leaf);
2006        btrfs_release_path(path);
2007
2008        path->reada = 1;
2009        path->leave_spinning = 1;
2010
2011        /* now insert the actual backref */
2012        ret = insert_extent_backref(trans, root->fs_info->extent_root,
2013                                    path, bytenr, parent, root_objectid,
2014                                    owner, offset, refs_to_add);
2015        if (ret)
2016                btrfs_abort_transaction(trans, root, ret);
2017out:
2018        btrfs_free_path(path);
2019        return ret;
2020}
2021
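/*
 * Apply one delayed ref to a data extent: insert the extent item for newly
 * reserved space (ADD with insert_reserved), add a backref to an existing
 * extent, or drop a backref and possibly free the extent.
 */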
2022static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2023                                struct btrfs_root *root,
2024                                struct btrfs_delayed_ref_node *node,
2025                                struct btrfs_delayed_extent_op *extent_op,
2026                                int insert_reserved)
2027{
2028        int ret = 0;
2029        struct btrfs_delayed_data_ref *ref;
2030        struct btrfs_key ins;
2031        u64 parent = 0;
2032        u64 ref_root = 0;
2033        u64 flags = 0;
2034
2035        ins.objectid = node->bytenr;
2036        ins.offset = node->num_bytes;
2037        ins.type = BTRFS_EXTENT_ITEM_KEY;
2038
2039        ref = btrfs_delayed_node_to_data_ref(node);
2040        trace_run_delayed_data_ref(node, ref, node->action);
2041
2042        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2043                parent = ref->parent;
2044        else
2045                ref_root = ref->root;
2046
2047        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2048                if (extent_op)
2049                        flags |= extent_op->flags_to_set;
2050                ret = alloc_reserved_file_extent(trans, root,
2051                                                 parent, ref_root, flags,
2052                                                 ref->objectid, ref->offset,
2053                                                 &ins, node->ref_mod);
2054        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2055                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2056                                             node->num_bytes, parent,
2057                                             ref_root, ref->objectid,
2058                                             ref->offset, node->ref_mod,
2059                                             extent_op);
2060        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2061                ret = __btrfs_free_extent(trans, root, node->bytenr,
2062                                          node->num_bytes, parent,
2063                                          ref_root, ref->objectid,
2064                                          ref->offset, node->ref_mod,
2065                                          extent_op);
2066        } else {
2067                BUG();
2068        }
2069        return ret;
2070}
2071
2072static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2073                                    struct extent_buffer *leaf,
2074                                    struct btrfs_extent_item *ei)
2075{
2076        u64 flags = btrfs_extent_flags(leaf, ei);
2077        if (extent_op->update_flags) {
2078                flags |= extent_op->flags_to_set;
2079                btrfs_set_extent_flags(leaf, ei, flags);
2080        }
2081
2082        if (extent_op->update_key) {
2083                struct btrfs_tree_block_info *bi;
2084                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2085                bi = (struct btrfs_tree_block_info *)(ei + 1);
2086                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2087        }
2088}
2089
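/*
 * Locate the extent item for a delayed ref head (skinny METADATA_ITEM or
 * old-style EXTENT_ITEM key) and apply the recorded flag/key updates via
 * __run_delayed_extent_op().
 */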
2090static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2091                                 struct btrfs_root *root,
2092                                 struct btrfs_delayed_ref_node *node,
2093                                 struct btrfs_delayed_extent_op *extent_op)
2094{
2095        struct btrfs_key key;
2096        struct btrfs_path *path;
2097        struct btrfs_extent_item *ei;
2098        struct extent_buffer *leaf;
2099        u32 item_size;
2100        int ret;
2101        int err = 0;
2102        int metadata = !extent_op->is_data;
2103
2104        if (trans->aborted)
2105                return 0;
2106
2107        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2108                metadata = 0;
2109
2110        path = btrfs_alloc_path();
2111        if (!path)
2112                return -ENOMEM;
2113
2114        key.objectid = node->bytenr;
2115
2116        if (metadata) {
2117                key.type = BTRFS_METADATA_ITEM_KEY;
2118                key.offset = extent_op->level;
2119        } else {
2120                key.type = BTRFS_EXTENT_ITEM_KEY;
2121                key.offset = node->num_bytes;
2122        }
2123
2124again:
2125        path->reada = 1;
2126        path->leave_spinning = 1;
2127        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2128                                path, 0, 1);
2129        if (ret < 0) {
2130                err = ret;
2131                goto out;
2132        }
2133        if (ret > 0) {
2134                if (metadata) {
2135                        if (path->slots[0] > 0) {
2136                                path->slots[0]--;
2137                                btrfs_item_key_to_cpu(path->nodes[0], &key,
2138                                                      path->slots[0]);
2139                                if (key.objectid == node->bytenr &&
2140                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
2141                                    key.offset == node->num_bytes)
2142                                        ret = 0;
2143                        }
2144                        if (ret > 0) {
2145                                btrfs_release_path(path);
2146                                metadata = 0;
2147
2148                                key.objectid = node->bytenr;
2149                                key.offset = node->num_bytes;
2150                                key.type = BTRFS_EXTENT_ITEM_KEY;
2151                                goto again;
2152                        }
2153                } else {
2154                        err = -EIO;
2155                        goto out;
2156                }
2157        }
2158
2159        leaf = path->nodes[0];
2160        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2161#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2162        if (item_size < sizeof(*ei)) {
2163                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2164                                             path, (u64)-1, 0);
2165                if (ret < 0) {
2166                        err = ret;
2167                        goto out;
2168                }
2169                leaf = path->nodes[0];
2170                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2171        }
2172#endif
2173        BUG_ON(item_size < sizeof(*ei));
2174        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2175        __run_delayed_extent_op(extent_op, leaf, ei);
2176
2177        btrfs_mark_buffer_dirty(leaf);
2178out:
2179        btrfs_free_path(path);
2180        return err;
2181}
2182
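/*
 * Tree block counterpart of run_delayed_data_ref(): insert the extent or
 * metadata item for reserved space, or add/drop the single backref for
 * the block.
 */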
2183static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2184                                struct btrfs_root *root,
2185                                struct btrfs_delayed_ref_node *node,
2186                                struct btrfs_delayed_extent_op *extent_op,
2187                                int insert_reserved)
2188{
2189        int ret = 0;
2190        struct btrfs_delayed_tree_ref *ref;
2191        struct btrfs_key ins;
2192        u64 parent = 0;
2193        u64 ref_root = 0;
2194        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2195                                                 SKINNY_METADATA);
2196
2197        ref = btrfs_delayed_node_to_tree_ref(node);
2198        trace_run_delayed_tree_ref(node, ref, node->action);
2199
2200        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2201                parent = ref->parent;
2202        else
2203                ref_root = ref->root;
2204
2205        ins.objectid = node->bytenr;
2206        if (skinny_metadata) {
2207                ins.offset = ref->level;
2208                ins.type = BTRFS_METADATA_ITEM_KEY;
2209        } else {
2210                ins.offset = node->num_bytes;
2211                ins.type = BTRFS_EXTENT_ITEM_KEY;
2212        }
2213
2214        BUG_ON(node->ref_mod != 1);
2215        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2216                BUG_ON(!extent_op || !extent_op->update_flags);
2217                ret = alloc_reserved_tree_block(trans, root,
2218                                                parent, ref_root,
2219                                                extent_op->flags_to_set,
2220                                                &extent_op->key,
2221                                                ref->level, &ins);
2222        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2223                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2224                                             node->num_bytes, parent, ref_root,
2225                                             ref->level, 0, 1, extent_op);
2226        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2227                ret = __btrfs_free_extent(trans, root, node->bytenr,
2228                                          node->num_bytes, parent, ref_root,
2229                                          ref->level, 0, 1, extent_op);
2230        } else {
2231                BUG();
2232        }
2233        return ret;
2234}
2235
2236/* helper function to actually process a single delayed ref entry */
2237static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2238                               struct btrfs_root *root,
2239                               struct btrfs_delayed_ref_node *node,
2240                               struct btrfs_delayed_extent_op *extent_op,
2241                               int insert_reserved)
2242{
2243        int ret = 0;
2244
2245        if (trans->aborted) {
2246                if (insert_reserved)
2247                        btrfs_pin_extent(root, node->bytenr,
2248                                         node->num_bytes, 1);
2249                return 0;
2250        }
2251
2252        if (btrfs_delayed_ref_is_head(node)) {
2253                struct btrfs_delayed_ref_head *head;
2254                /*
2255                 * we've hit the end of the chain and we were supposed
2256                 * to insert this extent into the tree.  But, it got
2257                 * deleted before we ever needed to insert it, so all
2258                 * we have to do is clean up the accounting
2259                 */
2260                BUG_ON(extent_op);
2261                head = btrfs_delayed_node_to_head(node);
2262                trace_run_delayed_ref_head(node, head, node->action);
2263
2264                if (insert_reserved) {
2265                        btrfs_pin_extent(root, node->bytenr,
2266                                         node->num_bytes, 1);
2267                        if (head->is_data) {
2268                                ret = btrfs_del_csums(trans, root,
2269                                                      node->bytenr,
2270                                                      node->num_bytes);
2271                        }
2272                }
2273                return ret;
2274        }
2275
2276        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2277            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2278                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2279                                           insert_reserved);
2280        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2281                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2282                ret = run_delayed_data_ref(trans, root, node, extent_op,
2283                                           insert_reserved);
2284        else
2285                BUG();
2286        return ret;
2287}
2288
2289static noinline struct btrfs_delayed_ref_node *
2290select_delayed_ref(struct btrfs_delayed_ref_head *head)
2291{
2292        struct rb_node *node;
2293        struct btrfs_delayed_ref_node *ref, *last = NULL;
2294
2295        /*
2296         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2297         * this prevents ref count from going down to zero when
2298         * there still are pending delayed refs.
2299         */
2300        node = rb_first(&head->ref_root);
2301        while (node) {
2302                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2303                                rb_node);
2304                if (ref->action == BTRFS_ADD_DELAYED_REF)
2305                        return ref;
2306                else if (last == NULL)
2307                        last = ref;
2308                node = rb_next(node);
2309        }
2310        return last;
2311}
2312
2313/*
2314 * Returns 0 on success or if called with an already aborted transaction.
2315 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2316 */
2317static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2318                                             struct btrfs_root *root,
2319                                             unsigned long nr)
2320{
2321        struct btrfs_delayed_ref_root *delayed_refs;
2322        struct btrfs_delayed_ref_node *ref;
2323        struct btrfs_delayed_ref_head *locked_ref = NULL;
2324        struct btrfs_delayed_extent_op *extent_op;
2325        struct btrfs_fs_info *fs_info = root->fs_info;
2326        ktime_t start = ktime_get();
2327        int ret;
2328        unsigned long count = 0;
2329        unsigned long actual_count = 0;
2330        int must_insert_reserved = 0;
2331
2332        delayed_refs = &trans->transaction->delayed_refs;
2333        while (1) {
2334                if (!locked_ref) {
2335                        if (count >= nr)
2336                                break;
2337
2338                        spin_lock(&delayed_refs->lock);
2339                        locked_ref = btrfs_select_ref_head(trans);
2340                        if (!locked_ref) {
2341                                spin_unlock(&delayed_refs->lock);
2342                                break;
2343                        }
2344
2345                        /* grab the lock that says we are going to process
2346                         * all the refs for this head */
2347                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2348                        spin_unlock(&delayed_refs->lock);
2349                        /*
2350                         * we may have dropped the spin lock to get the head
2351                         * mutex lock, and that might have given someone else
2352                         * time to free the head.  If that's true, it has been
2353                         * removed from our list and we can move on.
2354                         */
2355                        if (ret == -EAGAIN) {
2356                                locked_ref = NULL;
2357                                count++;
2358                                continue;
2359                        }
2360                }
2361
2362                /*
2363                 * We need to try and merge add/drops of the same ref since we
2364                 * can run into issues with relocate dropping the implicit ref
2365                 * and then it being added back again before the drop can
2366                 * finish.  If we merged anything we need to re-loop so we can
2367                 * get a good ref.
2368                 */
2369                spin_lock(&locked_ref->lock);
2370                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2371                                         locked_ref);
2372
2373                /*
2374                 * locked_ref is the head node, so we have to go one
2375                 * node back for any delayed ref updates
2376                 */
2377                ref = select_delayed_ref(locked_ref);
2378
2379                if (ref && ref->seq &&
2380                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2381                        spin_unlock(&locked_ref->lock);
2382                        btrfs_delayed_ref_unlock(locked_ref);
2383                        spin_lock(&delayed_refs->lock);
2384                        locked_ref->processing = 0;
2385                        delayed_refs->num_heads_ready++;
2386                        spin_unlock(&delayed_refs->lock);
2387                        locked_ref = NULL;
2388                        cond_resched();
2389                        count++;
2390                        continue;
2391                }
2392
2393                /*
2394                 * record the must insert reserved flag before we
2395                 * drop the spin lock.
2396                 */
2397                must_insert_reserved = locked_ref->must_insert_reserved;
2398                locked_ref->must_insert_reserved = 0;
2399
2400                extent_op = locked_ref->extent_op;
2401                locked_ref->extent_op = NULL;
2402
2403                if (!ref) {
2404
2405
2406                        /* All delayed refs have been processed, go ahead
2407                         * and send the head node to run_one_delayed_ref,
2408                         * so that any accounting fixes can happen
2409                         */
2410                        ref = &locked_ref->node;
2411
2412                        if (extent_op && must_insert_reserved) {
2413                                btrfs_free_delayed_extent_op(extent_op);
2414                                extent_op = NULL;
2415                        }
2416
2417                        if (extent_op) {
2418                                spin_unlock(&locked_ref->lock);
2419                                ret = run_delayed_extent_op(trans, root,
2420                                                            ref, extent_op);
2421                                btrfs_free_delayed_extent_op(extent_op);
2422
2423                                if (ret) {
2424                                        /*
2425                                         * Need to reset must_insert_reserved if
2426                                         * there was an error so the abort stuff
2427                                         * can cleanup the reserved space
2428                                         * properly.
2429                                         */
2430                                        if (must_insert_reserved)
2431                                                locked_ref->must_insert_reserved = 1;
2432                                        locked_ref->processing = 0;
2433                                        btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2434                                        btrfs_delayed_ref_unlock(locked_ref);
2435                                        return ret;
2436                                }
2437                                continue;
2438                        }
2439
2440                        /*
2441                         * Need to drop our head ref lock and re-acquire the
2442                         * delayed ref lock and then re-check to make sure
2443                         * nobody got added.
2444                         */
2445                        spin_unlock(&locked_ref->lock);
2446                        spin_lock(&delayed_refs->lock);
2447                        spin_lock(&locked_ref->lock);
2448                        if (rb_first(&locked_ref->ref_root) ||
2449                            locked_ref->extent_op) {
2450                                spin_unlock(&locked_ref->lock);
2451                                spin_unlock(&delayed_refs->lock);
2452                                continue;
2453                        }
2454                        ref->in_tree = 0;
2455                        delayed_refs->num_heads--;
2456                        rb_erase(&locked_ref->href_node,
2457                                 &delayed_refs->href_root);
2458                        spin_unlock(&delayed_refs->lock);
2459                } else {
2460                        actual_count++;
2461                        ref->in_tree = 0;
2462                        rb_erase(&ref->rb_node, &locked_ref->ref_root);
2463                }
2464                atomic_dec(&delayed_refs->num_entries);
2465
2466                if (!btrfs_delayed_ref_is_head(ref)) {
2467                        /*
2468                         * when we play the delayed ref, also correct the
2469                         * ref_mod on head
2470                         */
2471                        switch (ref->action) {
2472                        case BTRFS_ADD_DELAYED_REF:
2473                        case BTRFS_ADD_DELAYED_EXTENT:
2474                                locked_ref->node.ref_mod -= ref->ref_mod;
2475                                break;
2476                        case BTRFS_DROP_DELAYED_REF:
2477                                locked_ref->node.ref_mod += ref->ref_mod;
2478                                break;
2479                        default:
2480                                WARN_ON(1);
2481                        }
2482                }
2483                spin_unlock(&locked_ref->lock);
2484
2485                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2486                                          must_insert_reserved);
2487
2488                btrfs_free_delayed_extent_op(extent_op);
2489                if (ret) {
2490                        locked_ref->processing = 0;
2491                        btrfs_delayed_ref_unlock(locked_ref);
2492                        btrfs_put_delayed_ref(ref);
2493                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2494                        return ret;
2495                }
2496
2497                /*
2498                 * If this node is a head, that means all the refs in this head
2499                 * have been dealt with, and we will pick the next head to deal
2500                 * with, so we must unlock the head and drop it from the cluster
2501                 * list before we release it.
2502                 */
2503                if (btrfs_delayed_ref_is_head(ref)) {
2504                        btrfs_delayed_ref_unlock(locked_ref);
2505                        locked_ref = NULL;
2506                }
2507                btrfs_put_delayed_ref(ref);
2508                count++;
2509                cond_resched();
2510        }
2511
2512        /*
2513         * We don't want to include ref heads since we can have empty ref heads
2514         * and those will drastically skew our runtime down since we just do
2515         * accounting, no actual extent tree updates.
2516         */
2517        if (actual_count > 0) {
2518                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2519                u64 avg;
2520
2521                /*
2522                 * We weigh the current average higher than our current runtime
2523                 * to avoid large swings in the average.
2524                 */
2525                spin_lock(&delayed_refs->lock);
2526                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2527                avg = div64_u64(avg, 4);
2528                fs_info->avg_delayed_ref_runtime = avg;
2529                spin_unlock(&delayed_refs->lock);
2530        }
2531        return 0;
2532}
2533
2534#ifdef SCRAMBLE_DELAYED_REFS
2535/*
2536 * Normally delayed refs get processed in ascending bytenr order. This
2537 * correlates in most cases to the order added. To expose dependencies on this
2538 * order, we start to process the tree in the middle instead of the beginning
2539 */
2540static u64 find_middle(struct rb_root *root)
2541{
2542        struct rb_node *n = root->rb_node;
2543        struct btrfs_delayed_ref_node *entry;
2544        int alt = 1;
2545        u64 middle;
2546        u64 first = 0, last = 0;
2547
2548        n = rb_first(root);
2549        if (n) {
2550                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2551                first = entry->bytenr;
2552        }
2553        n = rb_last(root);
2554        if (n) {
2555                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2556                last = entry->bytenr;
2557        }
2558        n = root->rb_node;
2559
2560        while (n) {
2561                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2562                WARN_ON(!entry->in_tree);
2563
2564                middle = entry->bytenr;
2565
2566                if (alt)
2567                        n = n->rb_left;
2568                else
2569                        n = n->rb_right;
2570
2571                alt = 1 - alt;
2572        }
2573        return middle;
2574}
2575#endif
2576
2577int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2578                                         struct btrfs_fs_info *fs_info)
2579{
2580        struct qgroup_update *qgroup_update;
2581        int ret = 0;
2582
2583        if (list_empty(&trans->qgroup_ref_list) !=
2584            !trans->delayed_ref_elem.seq) {
2585                /* list without seq or seq without list */
2586                btrfs_err(fs_info,
2587                        "qgroup accounting update error, list is%s empty, seq is %#x.%x",
2588                        list_empty(&trans->qgroup_ref_list) ? "" : " not",
2589                        (u32)(trans->delayed_ref_elem.seq >> 32),
2590                        (u32)trans->delayed_ref_elem.seq);
2591                BUG();
2592        }
2593
2594        if (!trans->delayed_ref_elem.seq)
2595                return 0;
2596
2597        while (!list_empty(&trans->qgroup_ref_list)) {
2598                qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2599                                                 struct qgroup_update, list);
2600                list_del(&qgroup_update->list);
2601                if (!ret)
2602                        ret = btrfs_qgroup_account_ref(
2603                                        trans, fs_info, qgroup_update->node,
2604                                        qgroup_update->extent_op);
2605                kfree(qgroup_update);
2606        }
2607
2608        btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2609
2610        return ret;
2611}
2612
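/*
 * Rough estimate of how many extent tree leaves are needed to hold the
 * extent items plus one inline backref each for @heads delayed ref heads
 * (plus a tree_block_info each on non-skinny-metadata filesystems).
 */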
2613static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2614{
2615        u64 num_bytes;
2616
2617        num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2618                             sizeof(struct btrfs_extent_inline_ref));
2619        if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2620                num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2621
2622        /*
2623         * We don't ever fill up leaves all the way so multiply by 2 just to be
2624         * closer to what we're really going to want to use.
2625         */
2626        return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2627}
2628
2629int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2630                                       struct btrfs_root *root)
2631{
2632        struct btrfs_block_rsv *global_rsv;
2633        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2634        u64 num_bytes;
2635        int ret = 0;
2636
2637        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2638        num_heads = heads_to_leaves(root, num_heads);
2639        if (num_heads > 1)
2640                num_bytes += (num_heads - 1) * root->leafsize;
2641        num_bytes <<= 1;
2642        global_rsv = &root->fs_info->global_block_rsv;
2643
2644        /*
2645         * If we can't allocate any more chunks let's make sure we have _lots_ of
2646         * wiggle room since running delayed refs can create more delayed refs.
2647         */
2648        if (global_rsv->space_info->full)
2649                num_bytes <<= 1;
2650
2651        spin_lock(&global_rsv->lock);
2652        if (global_rsv->reserved <= num_bytes)
2653                ret = 1;
2654        spin_unlock(&global_rsv->lock);
2655        return ret;
2656}
2657
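/*
 * Decide whether the caller should stop and run delayed refs now: either
 * the backlog is expected to take a second or more to process
 * (num_entries * average runtime per ref), or the global reserve is
 * getting low per btrfs_check_space_for_delayed_refs().
 */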
2658int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2659                                       struct btrfs_root *root)
2660{
2661        struct btrfs_fs_info *fs_info = root->fs_info;
2662        u64 num_entries =
2663                atomic_read(&trans->transaction->delayed_refs.num_entries);
2664        u64 avg_runtime;
2665
2666        smp_mb();
2667        avg_runtime = fs_info->avg_delayed_ref_runtime;
2668        if (num_entries * avg_runtime >= NSEC_PER_SEC)
2669                return 1;
2670
2671        return btrfs_check_space_for_delayed_refs(trans, root);
2672}
2673
2674/*
2675 * this starts processing the delayed reference count updates and
2676 * extent insertions we have queued up so far.  count can be
2677 * 0, which means to process everything in the tree at the start
2678 * of the run (but not newly added entries), or it can be some target
2679 * number you'd like to process.
2680 *
2681 * Returns 0 on success or if called with an aborted transaction
2682 * Returns <0 on error and aborts the transaction
2683 */
2684int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2685                           struct btrfs_root *root, unsigned long count)
2686{
2687        struct rb_node *node;
2688        struct btrfs_delayed_ref_root *delayed_refs;
2689        struct btrfs_delayed_ref_head *head;
2690        int ret;
2691        int run_all = count == (unsigned long)-1;
2692        int run_most = 0;
2693
2694        /* We'll clean this up in btrfs_cleanup_transaction */
2695        if (trans->aborted)
2696                return 0;
2697
2698        if (root == root->fs_info->extent_root)
2699                root = root->fs_info->tree_root;
2700
2701        btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2702
2703        delayed_refs = &trans->transaction->delayed_refs;
2704        if (count == 0) {
2705                count = atomic_read(&delayed_refs->num_entries) * 2;
2706                run_most = 1;
2707        }
2708
2709again:
2710#ifdef SCRAMBLE_DELAYED_REFS
2711        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2712#endif
2713        ret = __btrfs_run_delayed_refs(trans, root, count);
2714        if (ret < 0) {
2715                btrfs_abort_transaction(trans, root, ret);
2716                return ret;
2717        }
2718
2719        if (run_all) {
2720                if (!list_empty(&trans->new_bgs))
2721                        btrfs_create_pending_block_groups(trans, root);
2722
2723                spin_lock(&delayed_refs->lock);
2724                node = rb_first(&delayed_refs->href_root);
2725                if (!node) {
2726                        spin_unlock(&delayed_refs->lock);
2727                        goto out;
2728                }
2729                count = (unsigned long)-1;
2730
2731                while (node) {
2732                        head = rb_entry(node, struct btrfs_delayed_ref_head,
2733                                        href_node);
2734                        if (btrfs_delayed_ref_is_head(&head->node)) {
2735                                struct btrfs_delayed_ref_node *ref;
2736
2737                                ref = &head->node;
2738                                atomic_inc(&ref->refs);
2739
2740                                spin_unlock(&delayed_refs->lock);
2741                                /*
2742                                 * Mutex was contended, block until it's
2743                                 * released and try again
2744                                 */
2745                                mutex_lock(&head->mutex);
2746                                mutex_unlock(&head->mutex);
2747
2748                                btrfs_put_delayed_ref(ref);
2749                                cond_resched();
2750                                goto again;
2751                        } else {
2752                                WARN_ON(1);
2753                        }
2754                        node = rb_next(node);
2755                }
2756                spin_unlock(&delayed_refs->lock);
2757                cond_resched();
2758                goto again;
2759        }
2760out:
2761        assert_qgroups_uptodate(trans);
2762        return 0;
2763}
2764
2765int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2766                                struct btrfs_root *root,
2767                                u64 bytenr, u64 num_bytes, u64 flags,
2768                                int level, int is_data)
2769{
2770        struct btrfs_delayed_extent_op *extent_op;
2771        int ret;
2772
2773        extent_op = btrfs_alloc_delayed_extent_op();
2774        if (!extent_op)
2775                return -ENOMEM;
2776
2777        extent_op->flags_to_set = flags;
2778        extent_op->update_flags = 1;
2779        extent_op->update_key = 0;
2780        extent_op->is_data = is_data ? 1 : 0;
2781        extent_op->level = level;
2782
2783        ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2784                                          num_bytes, extent_op);
2785        if (ret)
2786                btrfs_free_delayed_extent_op(extent_op);
2787        return ret;
2788}
2789
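/*
 * Look through the delayed refs queued for @bytenr.  Returns 1 if any of
 * them could create a reference that doesn't belong to this root, inode and
 * offset (i.e. a cross reference), 0 if none do, and -EAGAIN if the head
 * mutex was contended and the caller should retry.
 */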
2790static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2791                                      struct btrfs_root *root,
2792                                      struct btrfs_path *path,
2793                                      u64 objectid, u64 offset, u64 bytenr)
2794{
2795        struct btrfs_delayed_ref_head *head;
2796        struct btrfs_delayed_ref_node *ref;
2797        struct btrfs_delayed_data_ref *data_ref;
2798        struct btrfs_delayed_ref_root *delayed_refs;
2799        struct rb_node *node;
2800        int ret = 0;
2801
2802        delayed_refs = &trans->transaction->delayed_refs;
2803        spin_lock(&delayed_refs->lock);
2804        head = btrfs_find_delayed_ref_head(trans, bytenr);
2805        if (!head) {
2806                spin_unlock(&delayed_refs->lock);
2807                return 0;
2808        }
2809
2810        if (!mutex_trylock(&head->mutex)) {
2811                atomic_inc(&head->node.refs);
2812                spin_unlock(&delayed_refs->lock);
2813
2814                btrfs_release_path(path);
2815
2816                /*
2817                 * Mutex was contended, block until it's released and let
2818                 * caller try again
2819                 */
2820                mutex_lock(&head->mutex);
2821                mutex_unlock(&head->mutex);
2822                btrfs_put_delayed_ref(&head->node);
2823                return -EAGAIN;
2824        }
2825        spin_unlock(&delayed_refs->lock);
2826
2827        spin_lock(&head->lock);
2828        node = rb_first(&head->ref_root);
2829        while (node) {
2830                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2831                node = rb_next(node);
2832
2833                /* If it's a shared ref we know a cross reference exists */
2834                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2835                        ret = 1;
2836                        break;
2837                }
2838
2839                data_ref = btrfs_delayed_node_to_data_ref(ref);
2840
2841                /*
2842                 * If our ref doesn't match the one we're currently looking at
2843                 * then we have a cross reference.
2844                 */
2845                if (data_ref->root != root->root_key.objectid ||
2846                    data_ref->objectid != objectid ||
2847                    data_ref->offset != offset) {
2848                        ret = 1;
2849                        break;
2850                }
2851        }
2852        spin_unlock(&head->lock);
2853        mutex_unlock(&head->mutex);
2854        return ret;
2855}
2856
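/*
 * Check the committed extent tree for @bytenr.  Returns 0 when the extent
 * item shows a single inline data ref owned by this root, inode and offset
 * (and its generation is newer than the last snapshot), 1 when other
 * references may exist, and -ENOENT or another negative errno on failure.
 */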
2857static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2858                                        struct btrfs_root *root,
2859                                        struct btrfs_path *path,
2860                                        u64 objectid, u64 offset, u64 bytenr)
2861{
2862        struct btrfs_root *extent_root = root->fs_info->extent_root;
2863        struct extent_buffer *leaf;
2864        struct btrfs_extent_data_ref *ref;
2865        struct btrfs_extent_inline_ref *iref;
2866        struct btrfs_extent_item *ei;
2867        struct btrfs_key key;
2868        u32 item_size;
2869        int ret;
2870
2871        key.objectid = bytenr;
2872        key.offset = (u64)-1;
2873        key.type = BTRFS_EXTENT_ITEM_KEY;
2874
2875        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2876        if (ret < 0)
2877                goto out;
2878        BUG_ON(ret == 0); /* Corruption */
2879
2880        ret = -ENOENT;
2881        if (path->slots[0] == 0)
2882                goto out;
2883
2884        path->slots[0]--;
2885        leaf = path->nodes[0];
2886        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2887
2888        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2889                goto out;
2890
2891        ret = 1;
2892        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2893#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2894        if (item_size < sizeof(*ei)) {
2895                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2896                goto out;
2897        }
2898#endif
2899        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2900
2901        if (item_size != sizeof(*ei) +
2902            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2903                goto out;
2904
2905        if (btrfs_extent_generation(leaf, ei) <=
2906            btrfs_root_last_snapshot(&root->root_item))
2907                goto out;
2908
2909        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2910        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2911            BTRFS_EXTENT_DATA_REF_KEY)
2912                goto out;
2913
2914        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2915        if (btrfs_extent_refs(leaf, ei) !=
2916            btrfs_extent_data_ref_count(leaf, ref) ||
2917            btrfs_extent_data_ref_root(leaf, ref) !=
2918            root->root_key.objectid ||
2919            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2920            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2921                goto out;
2922
2923        ret = 0;
2924out:
2925        return ret;
2926}
2927
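/*
 * Returns 0 if the data extent at @bytenr is referenced only by the given
 * root, inode and offset, a positive value if another reference exists or
 * cannot be ruled out, and a negative errno on failure.  Both the committed
 * extent tree and the pending delayed refs are checked.
 */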
2928int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2929                          struct btrfs_root *root,
2930                          u64 objectid, u64 offset, u64 bytenr)
2931{
2932        struct btrfs_path *path;
2933        int ret;
2934        int ret2;
2935
2936        path = btrfs_alloc_path();
2937        if (!path)
2938                return -ENOENT;
2939
2940        do {
2941                ret = check_committed_ref(trans, root, path, objectid,
2942                                          offset, bytenr);
2943                if (ret && ret != -ENOENT)
2944                        goto out;
2945
2946                ret2 = check_delayed_ref(trans, root, path, objectid,
2947                                         offset, bytenr);
2948        } while (ret2 == -EAGAIN);
2949
2950        if (ret2 && ret2 != -ENOENT) {
2951                ret = ret2;
2952                goto out;
2953        }
2954
2955        if (ret != -ENOENT || ret2 != -ENOENT)
2956                ret = 0;
2957out:
2958        btrfs_free_path(path);
2959        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2960                WARN_ON(ret > 0);
2961        return ret;
2962}
2963
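/*
 * Walk every item in @buf and add (inc == 1) or drop (inc == 0) one
 * reference for each data extent and child tree block it points to.
 * Leaves of roots with ref_cows unset are skipped entirely.
 */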
2964static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2965                           struct btrfs_root *root,
2966                           struct extent_buffer *buf,
2967                           int full_backref, int inc, int for_cow)
2968{
2969        u64 bytenr;
2970        u64 num_bytes;
2971        u64 parent;
2972        u64 ref_root;
2973        u32 nritems;
2974        struct btrfs_key key;
2975        struct btrfs_file_extent_item *fi;
2976        int i;
2977        int level;
2978        int ret = 0;
2979        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2980                            u64, u64, u64, u64, u64, u64, int);
2981
2982        ref_root = btrfs_header_owner(buf);
2983        nritems = btrfs_header_nritems(buf);
2984        level = btrfs_header_level(buf);
2985
2986        if (!root->ref_cows && level == 0)
2987                return 0;
2988
2989        if (inc)
2990                process_func = btrfs_inc_extent_ref;
2991        else
2992                process_func = btrfs_free_extent;
2993
2994        if (full_backref)
2995                parent = buf->start;
2996        else
2997                parent = 0;
2998
2999        for (i = 0; i < nritems; i++) {
3000                if (level == 0) {
3001                        btrfs_item_key_to_cpu(buf, &key, i);
3002                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3003                                continue;
3004                        fi = btrfs_item_ptr(buf, i,
3005                                            struct btrfs_file_extent_item);
3006                        if (btrfs_file_extent_type(buf, fi) ==
3007                            BTRFS_FILE_EXTENT_INLINE)
3008                                continue;
3009                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3010                        if (bytenr == 0)
3011                                continue;
3012
3013                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3014                        key.offset -= btrfs_file_extent_offset(buf, fi);
3015                        ret = process_func(trans, root, bytenr, num_bytes,
3016                                           parent, ref_root, key.objectid,
3017                                           key.offset, for_cow);
3018                        if (ret)
3019                                goto fail;
3020                } else {
3021                        bytenr = btrfs_node_blockptr(buf, i);
3022                        num_bytes = btrfs_level_size(root, level - 1);
3023                        ret = process_func(trans, root, bytenr, num_bytes,
3024                                           parent, ref_root, level - 1, 0,
3025                                           for_cow);
3026                        if (ret)
3027                                goto fail;
3028                }
3029        }
3030        return 0;
3031fail:
3032        return ret;
3033}
3034
3035int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3036                  struct extent_buffer *buf, int full_backref, int for_cow)
3037{
3038        return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
3039}
3040
3041int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3042                  struct extent_buffer *buf, int full_backref, int for_cow)
3043{
3044        return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
3045}
3046
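/*
 * Copy the in-memory block group item for @cache back into its slot in the
 * extent tree and mark the leaf dirty.  The transaction is aborted on error.
 */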
3047static int write_one_cache_group(struct btrfs_trans_handle *trans,
3048                                 struct btrfs_root *root,
3049                                 struct btrfs_path *path,
3050                                 struct btrfs_block_group_cache *cache)
3051{
3052        int ret;
3053        struct btrfs_root *extent_root = root->fs_info->extent_root;
3054        unsigned long bi;
3055        struct extent_buffer *leaf;
3056
3057        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3058        if (ret < 0)
3059                goto fail;
3060        BUG_ON(ret); /* Corruption */
3061
3062        leaf = path->nodes[0];
3063        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3064        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3065        btrfs_mark_buffer_dirty(leaf);
3066        btrfs_release_path(path);
3067fail:
3068        if (ret) {
3069                btrfs_abort_transaction(trans, root, ret);
3070                return ret;
3071        }
3072        return 0;
3073
3074}
3075
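/*
 * Return the block group that follows @cache in the block group cache
 * rbtree, taking a reference on it and dropping the one held on @cache.
 * Returns NULL when @cache was the last group.
 */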
3076static struct btrfs_block_group_cache *
3077next_block_group(struct btrfs_root *root,
3078                 struct btrfs_block_group_cache *cache)
3079{
3080        struct rb_node *node;
3081        spin_lock(&root->fs_info->block_group_cache_lock);
3082        node = rb_next(&cache->cache_node);
3083        btrfs_put_block_group(cache);
3084        if (node) {
3085                cache = rb_entry(node, struct btrfs_block_group_cache,
3086                                 cache_node);
3087                btrfs_get_block_group(cache);
3088        } else
3089                cache = NULL;
3090        spin_unlock(&root->fs_info->block_group_cache_lock);
3091        return cache;
3092}
3093
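/*
 * Prepare the free space cache inode for @block_group in this transaction:
 * create the inode if needed, truncate any stale contents and preallocate
 * room for the new cache, recording the resulting disk_cache_state on the
 * block group.
 */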
3094static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3095                            struct btrfs_trans_handle *trans,
3096                            struct btrfs_path *path)
3097{
3098        struct btrfs_root *root = block_group->fs_info->tree_root;
3099        struct inode *inode = NULL;
3100        u64 alloc_hint = 0;
3101        int dcs = BTRFS_DC_ERROR;
3102        int num_pages = 0;
3103        int retries = 0;
3104        int ret = 0;
3105
3106        /*
3107         * If this block group is smaller than 100 megs, don't bother caching the
3108         * block group.
3109         */
3110        if (block_group->key.offset < (100 * 1024 * 1024)) {
3111                spin_lock(&block_group->lock);
3112                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3113                spin_unlock(&block_group->lock);
3114                return 0;
3115        }
3116
3117again:
3118        inode = lookup_free_space_inode(root, block_group, path);
3119        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3120                ret = PTR_ERR(inode);
3121                btrfs_release_path(path);
3122                goto out;
3123        }
3124
3125        if (IS_ERR(inode)) {
3126                BUG_ON(retries);
3127                retries++;
3128
3129                if (block_group->ro)
3130                        goto out_free;
3131
3132                ret = create_free_space_inode(root, trans, block_group, path);
3133                if (ret)
3134                        goto out_free;
3135                goto again;
3136        }
3137
3138        /* We've already set this up for this transaction, go ahead and exit */
3139        if (block_group->cache_generation == trans->transid &&
3140            i_size_read(inode)) {
3141                dcs = BTRFS_DC_SETUP;
3142                goto out_put;
3143        }
3144
3145        /*
3146         * We want to set the generation to 0, that way if anything goes wrong
3147         * from here on out we know not to trust this cache when we load up next
3148         * time.
3149         */
3150        BTRFS_I(inode)->generation = 0;
3151        ret = btrfs_update_inode(trans, root, inode);
3152        WARN_ON(ret);
3153
3154        if (i_size_read(inode) > 0) {
3155                ret = btrfs_check_trunc_cache_free_space(root,
3156                                        &root->fs_info->global_block_rsv);
3157                if (ret)
3158                        goto out_put;
3159
3160                ret = btrfs_truncate_free_space_cache(root, trans, inode);
3161                if (ret)
3162                        goto out_put;
3163        }
3164
3165        spin_lock(&block_group->lock);
3166        if (block_group->cached != BTRFS_CACHE_FINISHED ||
3167            !btrfs_test_opt(root, SPACE_CACHE)) {
3168                /*
3169                 * don't bother trying to write stuff out _if_
3170                 * a) we're not cached,
3171                 * b) we're mounted with the nospace_cache option.
3172                 */
3173                dcs = BTRFS_DC_WRITTEN;
3174                spin_unlock(&block_group->lock);
3175                goto out_put;
3176        }
3177        spin_unlock(&block_group->lock);
3178
3179        /*
3180         * Try to preallocate enough space based on how big the block group is.
3181         * Keep in mind this has to include any pinned space which could end up
3182         * taking up quite a bit since it's not folded into the other space
3183         * cache.
3184         */
3185        num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3186        if (!num_pages)
3187                num_pages = 1;
3188
3189        num_pages *= 16;
3190        num_pages *= PAGE_CACHE_SIZE;
3191
3192        ret = btrfs_check_data_free_space(inode, num_pages);
3193        if (ret)
3194                goto out_put;
3195
3196        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3197                                              num_pages, num_pages,
3198                                              &alloc_hint);
3199        if (!ret)
3200                dcs = BTRFS_DC_SETUP;
3201        btrfs_free_reserved_data_space(inode, num_pages);
3202
3203out_put:
3204        iput(inode);
3205out_free:
3206        btrfs_release_path(path);
3207out:
3208        spin_lock(&block_group->lock);
3209        if (!ret && dcs == BTRFS_DC_SETUP)
3210                block_group->cache_generation = trans->transid;
3211        block_group->disk_cache_state = dcs;
3212        spin_unlock(&block_group->lock);
3213
3214        return ret;
3215}
3216
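/*
 * Write out all dirty block group items and their free space caches,
 * restarting whenever new block groups become dirty or need their cache
 * set up along the way.
 */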
3217int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3218                                   struct btrfs_root *root)
3219{
3220        struct btrfs_block_group_cache *cache;
3221        int err = 0;
3222        struct btrfs_path *path;
3223        u64 last = 0;
3224
3225        path = btrfs_alloc_path();
3226        if (!path)
3227                return -ENOMEM;
3228
3229again:
3230        while (1) {
3231                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3232                while (cache) {
3233                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3234                                break;
3235                        cache = next_block_group(root, cache);
3236                }
3237                if (!cache) {
3238                        if (last == 0)
3239                                break;
3240                        last = 0;
3241                        continue;
3242                }
3243                err = cache_save_setup(cache, trans, path);
3244                last = cache->key.objectid + cache->key.offset;
3245                btrfs_put_block_group(cache);
3246        }
3247
3248        while (1) {
3249                if (last == 0) {
3250                        err = btrfs_run_delayed_refs(trans, root,
3251                                                     (unsigned long)-1);
3252                        if (err) /* File system offline */
3253                                goto out;
3254                }
3255
3256                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3257                while (cache) {
3258                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3259                                btrfs_put_block_group(cache);
3260                                goto again;
3261                        }
3262
3263                        if (cache->dirty)
3264                                break;
3265                        cache = next_block_group(root, cache);
3266                }
3267                if (!cache) {
3268                        if (last == 0)
3269                                break;
3270                        last = 0;
3271                        continue;
3272                }
3273
3274                if (cache->disk_cache_state == BTRFS_DC_SETUP)
3275                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3276                cache->dirty = 0;
3277                last = cache->key.objectid + cache->key.offset;
3278
3279                err = write_one_cache_group(trans, root, path, cache);
3280                btrfs_put_block_group(cache);
3281                if (err) /* File system offline */
3282                        goto out;
3283        }
3284
3285        while (1) {
3286                /*
3287                 * This probably isn't needed since we're just marking our
3288                 * preallocated extent as written, but it can't hurt just in
3289                 * case.
3290                 */
3291                if (last == 0) {
3292                        err = btrfs_run_delayed_refs(trans, root,
3293                                                     (unsigned long)-1);
3294                        if (err) /* File system offline */
3295                                goto out;
3296                }
3297
3298                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3299                while (cache) {
3300                        /*
3301                         * Really this shouldn't happen, but it could if we
3302                         * couldn't write the entire preallocated extent and
3303                         * splitting the extent resulted in a new block.
3304                         */
3305                        if (cache->dirty) {
3306                                btrfs_put_block_group(cache);
3307                                goto again;
3308                        }
3309                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3310                                break;
3311                        cache = next_block_group(root, cache);
3312                }
3313                if (!cache) {
3314                        if (last == 0)
3315                                break;
3316                        last = 0;
3317                        continue;
3318                }
3319
3320                err = btrfs_write_out_cache(root, trans, cache, path);
3321
3322                /*
3323                 * If we didn't have an error then the cache state is still
3324                 * NEED_WRITE, so we can set it to WRITTEN.
3325                 */
3326                if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3327                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
3328                last = cache->key.objectid + cache->key.offset;
3329                btrfs_put_block_group(cache);
3330        }
3331out:
3332
3333        btrfs_free_path(path);
3334        return err;
3335}
3336
3337int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3338{
3339        struct btrfs_block_group_cache *block_group;
3340        int readonly = 0;
3341
3342        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3343        if (!block_group || block_group->ro)
3344                readonly = 1;
3345        if (block_group)
3346                btrfs_put_block_group(block_group);
3347        return readonly;
3348}
3349
3350static const char *alloc_name(u64 flags)
3351{
3352        switch (flags) {
3353        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3354                return "mixed";
3355        case BTRFS_BLOCK_GROUP_METADATA:
3356                return "metadata";
3357        case BTRFS_BLOCK_GROUP_DATA:
3358                return "data";
3359        case BTRFS_BLOCK_GROUP_SYSTEM:
3360                return "system";
3361        default:
3362                WARN_ON(1);
3363                return "invalid-combination";
3364        }
3365}
3366
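/*
 * Account @total_bytes / @bytes_used to the space_info matching @flags,
 * allocating and registering a new space_info (including its sysfs kobject)
 * if one doesn't exist yet.  The byte counts are doubled for mirrored
 * profiles (DUP/RAID1/RAID10) when filling in the on-disk totals.
 */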
3367static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3368                             u64 total_bytes, u64 bytes_used,
3369                             struct btrfs_space_info **space_info)
3370{
3371        struct btrfs_space_info *found;
3372        int i;
3373        int factor;
3374        int ret;
3375
3376        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3377                     BTRFS_BLOCK_GROUP_RAID10))
3378                factor = 2;
3379        else
3380                factor = 1;
3381
3382        found = __find_space_info(info, flags);
3383        if (found) {
3384                spin_lock(&found->lock);
3385                found->total_bytes += total_bytes;
3386                found->disk_total += total_bytes * factor;
3387                found->bytes_used += bytes_used;
3388                found->disk_used += bytes_used * factor;
3389                found->full = 0;
3390                spin_unlock(&found->lock);
3391                *space_info = found;
3392                return 0;
3393        }
3394        found = kzalloc(sizeof(*found), GFP_NOFS);
3395        if (!found)
3396                return -ENOMEM;
3397
3398        ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3399        if (ret) {
3400                kfree(found);
3401                return ret;
3402        }
3403
3404        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
3405                INIT_LIST_HEAD(&found->block_groups[i]);
3406                kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
3407        }
3408        init_rwsem(&found->groups_sem);
3409        spin_lock_init(&found->lock);
3410        found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3411        found->total_bytes = total_bytes;
3412        found->disk_total = total_bytes * factor;
3413        found->bytes_used = bytes_used;
3414        found->disk_used = bytes_used * factor;
3415        found->bytes_pinned = 0;
3416        found->bytes_reserved = 0;
3417        found->bytes_readonly = 0;
3418        found->bytes_may_use = 0;
3419        found->full = 0;
3420        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3421        found->chunk_alloc = 0;
3422        found->flush = 0;
3423        init_waitqueue_head(&found->wait);
3424
3425        ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3426                                    info->space_info_kobj, "%s",
3427                                    alloc_name(found->flags));
3428        if (ret) {
3429                kfree(found);
3430                return ret;
3431        }
3432
3433        *space_info = found;
3434        list_add_rcu(&found->list, &info->space_info);
3435        if (flags & BTRFS_BLOCK_GROUP_DATA)
3436                info->data_sinfo = found;
3437
3438        return ret;
3439}
3440
3441static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3442{
3443        u64 extra_flags = chunk_to_extended(flags) &
3444                                BTRFS_EXTENDED_PROFILE_MASK;
3445
3446        write_seqlock(&fs_info->profiles_lock);
3447        if (flags & BTRFS_BLOCK_GROUP_DATA)
3448                fs_info->avail_data_alloc_bits |= extra_flags;
3449        if (flags & BTRFS_BLOCK_GROUP_METADATA)
3450                fs_info->avail_metadata_alloc_bits |= extra_flags;
3451        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3452                fs_info->avail_system_alloc_bits |= extra_flags;
3453        write_sequnlock(&fs_info->profiles_lock);
3454}
3455
3456/*
3457 * returns target flags in extended format or 0 if restripe for this
3458 * chunk_type is not in progress
3459 *
3460 * should be called with either volume_mutex or balance_lock held
3461 */
3462static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3463{
3464        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3465        u64 target = 0;
3466
3467        if (!bctl)
3468                return 0;
3469
3470        if (flags & BTRFS_BLOCK_GROUP_DATA &&
3471            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3472                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3473        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3474                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3475                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3476        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3477                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3478                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3479        }
3480
3481        return target;
3482}
3483
3484/*
3485 * @flags: available profiles in extended format (see ctree.h)
3486 *
3487 * Returns reduced profile in chunk format.  If profile changing is in
3488 * progress (either running or paused) picks the target profile (if it's
3489 * already available), otherwise falls back to plain reducing.
3490 */
3491static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3492{
3493        /*
3494         * we add in the count of missing devices because we want
3495         * to make sure that any RAID levels on a degraded FS
3496         * continue to be honored.
3497         */
3498        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3499                root->fs_info->fs_devices->missing_devices;
3500        u64 target;
3501        u64 tmp;
3502
3503        /*
3504         * see if restripe for this chunk_type is in progress, if so
3505         * try to reduce to the target profile
3506         */
3507        spin_lock(&root->fs_info->balance_lock);
3508        target = get_restripe_target(root->fs_info, flags);
3509        if (target) {
3510                /* pick target profile only if it's already available */
3511                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3512                        spin_unlock(&root->fs_info->balance_lock);
3513                        return extended_to_chunk(target);
3514                }
3515        }
3516        spin_unlock(&root->fs_info->balance_lock);
3517
3518        /* First, mask out the RAID levels which aren't possible */
3519        if (num_devices == 1)
3520                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3521                           BTRFS_BLOCK_GROUP_RAID5);
3522        if (num_devices < 3)
3523                flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3524        if (num_devices < 4)
3525                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3526
3527        tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3528                       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3529                       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3530        flags &= ~tmp;
3531
3532        if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3533                tmp = BTRFS_BLOCK_GROUP_RAID6;
3534        else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3535                tmp = BTRFS_BLOCK_GROUP_RAID5;
3536        else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3537                tmp = BTRFS_BLOCK_GROUP_RAID10;
3538        else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3539                tmp = BTRFS_BLOCK_GROUP_RAID1;
3540        else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3541                tmp = BTRFS_BLOCK_GROUP_RAID0;
3542
3543        return extended_to_chunk(flags | tmp);
3544}
3545
3546static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3547{
3548        unsigned seq;
3549        u64 flags;
3550
3551        do {
3552                flags = orig_flags;
3553                seq = read_seqbegin(&root->fs_info->profiles_lock);
3554
3555                if (flags & BTRFS_BLOCK_GROUP_DATA)
3556                        flags |= root->fs_info->avail_data_alloc_bits;
3557                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3558                        flags |= root->fs_info->avail_system_alloc_bits;
3559                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3560                        flags |= root->fs_info->avail_metadata_alloc_bits;
3561        } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3562
3563        return btrfs_reduce_alloc_profile(root, flags);
3564}
3565
3566u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3567{
3568        u64 flags;
3569        u64 ret;
3570
3571        if (data)
3572                flags = BTRFS_BLOCK_GROUP_DATA;
3573        else if (root == root->fs_info->chunk_root)
3574                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3575        else
3576                flags = BTRFS_BLOCK_GROUP_METADATA;
3577
3578        ret = get_alloc_profile(root, flags);
3579        return ret;
3580}
3581
3582/*
3583 * This will check the space that the inode allocates from to make sure we have
3584 * enough space for the requested number of bytes.
3585 */
3586int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3587{
3588        struct btrfs_space_info *data_sinfo;
3589        struct btrfs_root *root = BTRFS_I(inode)->root;
3590        struct btrfs_fs_info *fs_info = root->fs_info;
3591        u64 used;
3592        int ret = 0, committed = 0, alloc_chunk = 1;
3593
3594        /* make sure bytes are sectorsize aligned */
3595        bytes = ALIGN(bytes, root->sectorsize);
3596
3597        if (btrfs_is_free_space_inode(inode)) {
3598                committed = 1;
3599                ASSERT(current->journal_info);
3600        }
3601
3602        data_sinfo = fs_info->data_sinfo;
3603        if (!data_sinfo)
3604                goto alloc;
3605
3606again:
3607        /* make sure we have enough space to handle the data first */
3608        spin_lock(&data_sinfo->lock);
3609        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3610                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3611                data_sinfo->bytes_may_use;
3612
3613        if (used + bytes > data_sinfo->total_bytes) {
3614                struct btrfs_trans_handle *trans;
3615
3616                /*
3617                 * if we don't have enough free bytes in this space then we need
3618                 * to alloc a new chunk.
3619                 */
3620                if (!data_sinfo->full && alloc_chunk) {
3621                        u64 alloc_target;
3622
3623                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3624                        spin_unlock(&data_sinfo->lock);
3625alloc:
3626                        alloc_target = btrfs_get_alloc_profile(root, 1);
3627                        /*
3628                         * It is ugly that we don't use the nolock join
3629                         * transaction for the free space inode case here.
3630                         * But it is safe because we only do the data space
3631                         * reservation for the free space cache in the
3632                         * transaction context; the common join transaction
3633                         * just increases the counter of the current
3634                         * transaction handle and doesn't try to acquire the
3635                         * trans_lock of the fs.
3636                         */
3637                        trans = btrfs_join_transaction(root);
3638                        if (IS_ERR(trans))
3639                                return PTR_ERR(trans);
3640
3641                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3642                                             alloc_target,
3643                                             CHUNK_ALLOC_NO_FORCE);
3644                        btrfs_end_transaction(trans, root);
3645                        if (ret < 0) {
3646                                if (ret != -ENOSPC)
3647                                        return ret;
3648                                else
3649                                        goto commit_trans;
3650                        }
3651
3652                        if (!data_sinfo)
3653                                data_sinfo = fs_info->data_sinfo;
3654
3655                        goto again;
3656                }
3657
3658                /*
3659                 * If we don't have enough pinned space to deal with this
3660                 * allocation don't bother committing the transaction.
3661                 */
3662                if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3663                                           bytes) < 0)
3664                        committed = 1;
3665                spin_unlock(&data_sinfo->lock);
3666
3667                /* commit the current transaction and try again */
3668commit_trans:
3669                if (!committed &&
3670                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3671                        committed = 1;
3672
3673                        trans = btrfs_join_transaction(root);
3674                        if (IS_ERR(trans))
3675                                return PTR_ERR(trans);
3676                        ret = btrfs_commit_transaction(trans, root);
3677                        if (ret)
3678                                return ret;
3679                        goto again;
3680                }
3681
3682                trace_btrfs_space_reservation(root->fs_info,
3683                                              "space_info:enospc",
3684                                              data_sinfo->flags, bytes, 1);
3685                return -ENOSPC;
3686        }
3687        data_sinfo->bytes_may_use += bytes;
3688        trace_btrfs_space_reservation(root->fs_info, "space_info",
3689                                      data_sinfo->flags, bytes, 1);
3690        spin_unlock(&data_sinfo->lock);
3691
3692        return 0;
3693}
3694
3695/*
3696 * Called if we need to clear a data reservation for this inode.
3697 */
3698void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3699{
3700        struct btrfs_root *root = BTRFS_I(inode)->root;
3701        struct btrfs_space_info *data_sinfo;
3702
3703        /* make sure bytes are sectorsize aligned */
3704        bytes = ALIGN(bytes, root->sectorsize);
3705
3706        data_sinfo = root->fs_info->data_sinfo;
3707        spin_lock(&data_sinfo->lock);
3708        WARN_ON(data_sinfo->bytes_may_use < bytes);
3709        data_sinfo->bytes_may_use -= bytes;
3710        trace_btrfs_space_reservation(root->fs_info, "space_info",
3711                                      data_sinfo->flags, bytes, 0);
3712        spin_unlock(&data_sinfo->lock);
3713}
3714
3715static void force_metadata_allocation(struct btrfs_fs_info *info)
3716{
3717        struct list_head *head = &info->space_info;
3718        struct btrfs_space_info *found;
3719
3720        rcu_read_lock();
3721        list_for_each_entry_rcu(found, head, list) {
3722                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3723                        found->force_alloc = CHUNK_ALLOC_FORCE;
3724        }
3725        rcu_read_unlock();
3726}
3727
3728static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
3729{
3730        return (global->size << 1);
3731}
3732
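/*
 * Decide whether a new chunk should be allocated for @sinfo.
 * CHUNK_ALLOC_FORCE always allocates; CHUNK_ALLOC_LIMITED allocates once
 * free space drops below roughly 1% of the filesystem (with a 64M floor);
 * otherwise a chunk is only allocated once about 80% of the existing space
 * is in use.  For metadata the global reserve counts as allocated space.
 */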
3733static int should_alloc_chunk(struct btrfs_root *root,
3734                              struct btrfs_space_info *sinfo, int force)
3735{
3736        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3737        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3738        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3739        u64 thresh;
3740
3741        if (force == CHUNK_ALLOC_FORCE)
3742                return 1;
3743
3744        /*
3745         * We need to take into account the global rsv because for all intents
3746         * and purposes it's used space.  Don't worry about locking the
3747         * global_rsv, it doesn't change except when the transaction commits.
3748         */
3749        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3750                num_allocated += calc_global_rsv_need_space(global_rsv);
3751
3752        /*
3753         * in limited mode, we want to have some free space up to
3754         * about 1% of the FS size.
3755         */
3756        if (force == CHUNK_ALLOC_LIMITED) {
3757                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3758                thresh = max_t(u64, 64 * 1024 * 1024,
3759                               div_factor_fine(thresh, 1));
3760
3761                if (num_bytes - num_allocated < thresh)
3762                        return 1;
3763        }
3764
3765        if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3766                return 0;
3767        return 1;
3768}
3769
3770static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3771{
3772        u64 num_dev;
3773
3774        if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3775                    BTRFS_BLOCK_GROUP_RAID0 |
3776                    BTRFS_BLOCK_GROUP_RAID5 |
3777                    BTRFS_BLOCK_GROUP_RAID6))
3778                num_dev = root->fs_info->fs_devices->rw_devices;
3779        else if (type & BTRFS_BLOCK_GROUP_RAID1)
3780                num_dev = 2;
3781        else
3782                num_dev = 1;    /* DUP or single */
3783
3784        /* metadata for updating devices and chunk tree */
3785        return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3786}
3787
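/*
 * Make sure the SYSTEM space has enough room for the chunk tree and device
 * item updates that allocating a chunk of @type will require; allocate a
 * new SYSTEM chunk if it looks too low.
 */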
3788static void check_system_chunk(struct btrfs_trans_handle *trans,
3789                               struct btrfs_root *root, u64 type)
3790{
3791        struct btrfs_space_info *info;
3792        u64 left;
3793        u64 thresh;
3794
3795        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3796        spin_lock(&info->lock);
3797        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3798                info->bytes_reserved - info->bytes_readonly;
3799        spin_unlock(&info->lock);
3800
3801        thresh = get_system_chunk_thresh(root, type);
3802        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3803                btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3804                        left, thresh, type);
3805                dump_space_info(info, 0, 0);
3806        }
3807
3808        if (left < thresh) {
3809                u64 flags;
3810
3811                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3812                btrfs_alloc_chunk(trans, root, flags);
3813        }
3814}
3815
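/*
 * Allocate a new chunk of @flags if should_alloc_chunk() agrees.  Only one
 * allocation per space_info runs at a time; later callers wait on the
 * chunk_mutex and then recheck.  Returns 1 if a chunk was allocated, 0 if
 * no allocation was needed, and a negative errno (including -ENOSPC when
 * the devices are full) on failure.
 */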
3816static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3817                          struct btrfs_root *extent_root, u64 flags, int force)
3818{
3819        struct btrfs_space_info *space_info;
3820        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3821        int wait_for_alloc = 0;
3822        int ret = 0;
3823
3824        /* Don't re-enter if we're already allocating a chunk */
3825        if (trans->allocating_chunk)
3826                return -ENOSPC;
3827
3828        space_info = __find_space_info(extent_root->fs_info, flags);
3829        if (!space_info) {
3830                ret = update_space_info(extent_root->fs_info, flags,
3831                                        0, 0, &space_info);
3832                BUG_ON(ret); /* -ENOMEM */
3833        }
3834        BUG_ON(!space_info); /* Logic error */
3835
3836again:
3837        spin_lock(&space_info->lock);
3838        if (force < space_info->force_alloc)
3839                force = space_info->force_alloc;
3840        if (space_info->full) {
3841                if (should_alloc_chunk(extent_root, space_info, force))
3842                        ret = -ENOSPC;
3843                else
3844                        ret = 0;
3845                spin_unlock(&space_info->lock);
3846                return ret;
3847        }
3848
3849        if (!should_alloc_chunk(extent_root, space_info, force)) {
3850                spin_unlock(&space_info->lock);
3851                return 0;
3852        } else if (space_info->chunk_alloc) {
3853                wait_for_alloc = 1;
3854        } else {
3855                space_info->chunk_alloc = 1;
3856        }
3857
3858        spin_unlock(&space_info->lock);
3859
3860        mutex_lock(&fs_info->chunk_mutex);
3861
3862        /*
3863         * The chunk_mutex is held throughout the entirety of a chunk
3864         * allocation, so once we've acquired the chunk_mutex we know that the
3865         * other guy is done and we need to recheck and see if we should
3866         * allocate.
3867         */
3868        if (wait_for_alloc) {
3869                mutex_unlock(&fs_info->chunk_mutex);
3870                wait_for_alloc = 0;
3871                goto again;
3872        }
3873
3874        trans->allocating_chunk = true;
3875
3876        /*
3877         * If we have mixed data/metadata chunks we want to make sure we keep
3878         * allocating mixed chunks instead of individual chunks.
3879         */
3880        if (btrfs_mixed_space_info(space_info))
3881                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3882
3883        /*
3884         * if we're doing a data chunk, go ahead and make sure that
3885         * we keep a reasonable number of metadata chunks allocated in the
3886         * FS as well.
3887         */
3888        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3889                fs_info->data_chunk_allocations++;
3890                if (!(fs_info->data_chunk_allocations %
3891                      fs_info->metadata_ratio))
3892                        force_metadata_allocation(fs_info);
3893        }
3894
3895        /*
3896         * Check if we have enough space in SYSTEM chunk because we may need
3897         * to update devices.
3898         */
3899        check_system_chunk(trans, extent_root, flags);
3900
3901        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3902        trans->allocating_chunk = false;
3903
3904        spin_lock(&space_info->lock);
3905        if (ret < 0 && ret != -ENOSPC)
3906                goto out;
3907        if (ret)
3908                space_info->full = 1;
3909        else
3910                ret = 1;
3911
3912        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3913out:
3914        space_info->chunk_alloc = 0;
3915        spin_unlock(&space_info->lock);
3916        mutex_unlock(&fs_info->chunk_mutex);
3917        return ret;
3918}
3919
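/*
 * Decide whether a reservation of @bytes may overcommit @space_info.  The
 * global reserve is treated as used space, the unallocated device space is
 * halved for mirrored profiles, and only 1/2 (or 1/8 when a full flush is
 * allowed) of what remains is treated as extra headroom.
 */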
3920static int can_overcommit(struct btrfs_root *root,
3921                          struct btrfs_space_info *space_info, u64 bytes,
3922                          enum btrfs_reserve_flush_enum flush)
3923{
3924        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3925        u64 profile = btrfs_get_alloc_profile(root, 0);
3926        u64 space_size;
3927        u64 avail;
3928        u64 used;
3929
3930        used = space_info->bytes_used + space_info->bytes_reserved +
3931                space_info->bytes_pinned + space_info->bytes_readonly;
3932
3933        /*
3934         * We only want to allow overcommitting if we have lots of actual space
3935         * free, but if we don't have enough space to handle the global reserve
3936         * space then we could end up having a real enospc problem when trying
3937         * to allocate a chunk or some other such important allocation.
3938         */
3939        spin_lock(&global_rsv->lock);
3940        space_size = calc_global_rsv_need_space(global_rsv);
3941        spin_unlock(&global_rsv->lock);
3942        if (used + space_size >= space_info->total_bytes)
3943                return 0;
3944
3945        used += space_info->bytes_may_use;
3946
3947        spin_lock(&root->fs_info->free_chunk_lock);
3948        avail = root->fs_info->free_chunk_space;
3949        spin_unlock(&root->fs_info->free_chunk_lock);
3950
3951        /*
3952         * If we have dup, raid1 or raid10 then only half of the free
3953         * space is actually usable.  For raid56, the space info used
3954         * doesn't include the parity drive, so we don't have to
3955         * change the math
3956         */
3957        if (profile & (BTRFS_BLOCK_GROUP_DUP |
3958                       BTRFS_BLOCK_GROUP_RAID1 |
3959                       BTRFS_BLOCK_GROUP_RAID10))
3960                avail >>= 1;
3961
3962        /*
3963         * If we aren't flushing all things, let us overcommit up to
3964         * 1/2 of the space.  If we can flush everything, don't let us
3965         * overcommit too much; limit it to 1/8 of the space.
3966         */
3967        if (flush == BTRFS_RESERVE_FLUSH_ALL)
3968                avail >>= 3;
3969        else
3970                avail >>= 1;
3971
3972        if (used + bytes < space_info->total_bytes + avail)
3973                return 1;
3974        return 0;
3975}
3976
3977static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3978                                         unsigned long nr_pages, int nr_items)
3979{
3980        struct super_block *sb = root->fs_info->sb;
3981
3982        if (down_read_trylock(&sb->s_umount)) {
3983                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3984                up_read(&sb->s_umount);
3985        } else {
3986                /*
3987                 * We needn't worry about the filesystem going from r/w to
3988                 * r/o even though we don't acquire the ->s_umount mutex,
3989                 * because the filesystem should guarantee the delalloc inode
3990                 * list is empty once it is read-only (all dirty pages have
3991                 * been written to disk).
3992                 */
3993                btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3994                if (!current->journal_info)
3995                        btrfs_wait_ordered_roots(root->fs_info, nr_items);
3996        }
3997}
3998
3999static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4000{
4001        u64 bytes;
4002        int nr;
4003
4004        bytes = btrfs_calc_trans_metadata_size(root, 1);
4005        nr = (int)div64_u64(to_reclaim, bytes);
4006        if (!nr)
4007                nr = 1;
4008        return nr;
4009}
4010
4011#define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4012
4013/*
4014 * shrink metadata reservation for delalloc
4015 */
4016static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4017                            bool wait_ordered)
4018{
4019        struct btrfs_block_rsv *block_rsv;
4020        struct btrfs_space_info *space_info;
4021        struct btrfs_trans_handle *trans;
4022        u64 delalloc_bytes;
4023        u64 max_reclaim;
4024        long time_left;
4025        unsigned long nr_pages;
4026        int loops;
4027        int items;
4028        enum btrfs_reserve_flush_enum flush;
4029
4030        /* Calc the number of items we need to flush for this space reservation */
4031        items = calc_reclaim_items_nr(root, to_reclaim);
4032        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4033
4034        trans = (struct btrfs_trans_handle *)current->journal_info;
4035        block_rsv = &root->fs_info->delalloc_block_rsv;
4036        space_info = block_rsv->space_info;
4037
4038        delalloc_bytes = percpu_counter_sum_positive(
4039                                                &root->fs_info->delalloc_bytes);
4040        if (delalloc_bytes == 0) {
4041                if (trans)
4042                        return;
4043                if (wait_ordered)
4044                        btrfs_wait_ordered_roots(root->fs_info, items);
4045                return;
4046        }
4047
4048        loops = 0;
4049        while (delalloc_bytes && loops < 3) {
4050                max_reclaim = min(delalloc_bytes, to_reclaim);
4051                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4052                btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4053                /*
4054                 * We need to wait for the async pages to actually start before
4055                 * we do anything.
4056                 */
4057                max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4058                if (!max_reclaim)
4059                        goto skip_async;
4060
4061                if (max_reclaim <= nr_pages)
4062                        max_reclaim = 0;
4063                else
4064                        max_reclaim -= nr_pages;
4065
4066                wait_event(root->fs_info->async_submit_wait,
4067                           atomic_read(&root->fs_info->async_delalloc_pages) <=
4068                           (int)max_reclaim);
4069skip_async:
4070                if (!trans)
4071                        flush = BTRFS_RESERVE_FLUSH_ALL;
4072                else
4073                        flush = BTRFS_RESERVE_NO_FLUSH;
4074                spin_lock(&space_info->lock);
4075                if (can_overcommit(root, space_info, orig, flush)) {
4076                        spin_unlock(&space_info->lock);
4077                        break;
4078                }
4079                spin_unlock(&space_info->lock);
4080
4081                loops++;
4082                if (wait_ordered && !trans) {
4083                        btrfs_wait_ordered_roots(root->fs_info, items);
4084                } else {
4085                        time_left = schedule_timeout_killable(1);
4086                        if (time_left)
4087                                break;
4088                }
4089                delalloc_bytes = percpu_counter_sum_positive(
4090                                                &root->fs_info->delalloc_bytes);
4091        }
4092}
4093
4094/**
4095 * may_commit_transaction - possibly commit the transaction if it's ok to
4096 * @root - the root we're allocating for
4097 * @bytes - the number of bytes we want to reserve
4098 * @force - force the commit
4099 *
4100 * This will check to make sure that committing the transaction will actually
4101 * get us somewhere and then commit the transaction if it does.  Otherwise it
4102 * will return -ENOSPC.
4103 */
4104static int may_commit_transaction(struct btrfs_root *root,
4105                                  struct btrfs_space_info *space_info,
4106                                  u64 bytes, int force)
4107{
4108        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4109        struct btrfs_trans_handle *trans;
4110
4111        trans = (struct btrfs_trans_handle *)current->journal_info;
4112        if (trans)
4113                return -EAGAIN;
4114
4115        if (force)
4116                goto commit;
4117
4118        /* See if there is enough pinned space to make this reservation */
4119        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4120                                   bytes) >= 0)
4121                goto commit;
4122
4123        /*
4124         * See if there is some space in the delayed insertion reservation for
4125         * this reservation.
4126         */
4127        if (space_info != delayed_rsv->space_info)
4128                return -ENOSPC;
4129
4130        spin_lock(&delayed_rsv->lock);
4131        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4132                                   bytes - delayed_rsv->size) >= 0) {
4133                spin_unlock(&delayed_rsv->lock);
4134                return -ENOSPC;
4135        }
4136        spin_unlock(&delayed_rsv->lock);
4137
4138commit:
4139        trans = btrfs_join_transaction(root);
4140        if (IS_ERR(trans))
4141                return -ENOSPC;
4142
4143        return btrfs_commit_transaction(trans, root);
4144}
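
/*
 * Editor's note (illustrative, not part of the original source): a commit
 * only helps if the pinned and delayed-insertion space it would release can
 * cover the request, which is what the checks above implement.  A
 * hypothetical caller that treats -EAGAIN (we already hold a transaction)
 * and -ENOSPC (committing would not help) the same might do:
 *
 *	ret = may_commit_transaction(root, space_info, bytes, 0);
 *	if (ret)
 *		ret = -ENOSPC;	// no point retrying the reservation
 */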
4145
4146enum flush_state {
4147        FLUSH_DELAYED_ITEMS_NR  =       1,
4148        FLUSH_DELAYED_ITEMS     =       2,
4149        FLUSH_DELALLOC          =       3,
4150        FLUSH_DELALLOC_WAIT     =       4,
4151        ALLOC_CHUNK             =       5,
4152        COMMIT_TRANS            =       6,
4153};
4154
4155static int flush_space(struct btrfs_root *root,
4156                       struct btrfs_space_info *space_info, u64 num_bytes,
4157                       u64 orig_bytes, int state)
4158{
4159        struct btrfs_trans_handle *trans;
4160        int nr;
4161        int ret = 0;
4162
4163        switch (state) {
4164        case FLUSH_DELAYED_ITEMS_NR:
4165        case FLUSH_DELAYED_ITEMS:
4166                if (state == FLUSH_DELAYED_ITEMS_NR)
4167                        nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4168                else
4169                        nr = -1;
4170
4171                trans = btrfs_join_transaction(root);
4172                if (IS_ERR(trans)) {
4173                        ret = PTR_ERR(trans);
4174                        break;
4175                }
4176                ret = btrfs_run_delayed_items_nr(trans, root, nr);
4177                btrfs_end_transaction(trans, root);
4178                break;
4179        case FLUSH_DELALLOC:
4180        case FLUSH_DELALLOC_WAIT:
4181                shrink_delalloc(root, num_bytes * 2, orig_bytes,
4182                                state == FLUSH_DELALLOC_WAIT);
4183                break;
4184        case ALLOC_CHUNK:
4185                trans = btrfs_join_transaction(root);
4186                if (IS_ERR(trans)) {
4187                        ret = PTR_ERR(trans);
4188                        break;
4189                }
4190                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4191                                     btrfs_get_alloc_profile(root, 0),
4192                                     CHUNK_ALLOC_NO_FORCE);
4193                btrfs_end_transaction(trans, root);
4194                if (ret == -ENOSPC)
4195                        ret = 0;
4196                break;
4197        case COMMIT_TRANS:
4198                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4199                break;
4200        default:
4201                ret = -ENOSPC;
4202                break;
4203        }
4204
4205        return ret;
4206}
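
/*
 * Editor's sketch (not in the original file): the flush_state values above
 * form an escalating ladder that reserve_metadata_bytes() below walks one
 * step per failed attempt, roughly:
 *
 *	int state = FLUSH_DELAYED_ITEMS_NR;
 *
 *	while (!try_reserve(space_info, orig_bytes)) {	// hypothetical helper
 *		if (state > COMMIT_TRANS)
 *			return -ENOSPC;			// nothing left to try
 *		flush_space(root, space_info, num_bytes, orig_bytes, state++);
 *	}
 *
 * try_reserve() stands in for the bytes_may_use/can_overcommit() checks done
 * under space_info->lock in the real function.
 */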
4207/**
4208 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4209 * @root - the root we're allocating for
4210 * @block_rsv - the block_rsv we're allocating for
4211 * @orig_bytes - the number of bytes we want
4212 * @flush - whether or not we can flush to make our reservation
4213 *
4214 * This will reserve orig_bytes bytes from the space info associated
4215 * with the block_rsv.  If there is not enough space it will make an attempt to
4216 * flush out space to make room.  It will do this by flushing delalloc if
4217 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4218 * then no attempts to regain reservations will be made and this will fail if
4219 * there is not enough space already.
4220 */
4221static int reserve_metadata_bytes(struct btrfs_root *root,
4222                                  struct btrfs_block_rsv *block_rsv,
4223                                  u64 orig_bytes,
4224                                  enum btrfs_reserve_flush_enum flush)
4225{
4226        struct btrfs_space_info *space_info = block_rsv->space_info;
4227        u64 used;
4228        u64 num_bytes = orig_bytes;
4229        int flush_state = FLUSH_DELAYED_ITEMS_NR;
4230        int ret = 0;
4231        bool flushing = false;
4232
4233again:
4234        ret = 0;
4235        spin_lock(&space_info->lock);
4236        /*
4237         * We only want to wait if somebody other than us is flushing and we
4238         * are actually allowed to flush all things.
4239         */
4240        while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4241               space_info->flush) {
4242                spin_unlock(&space_info->lock);
4243                /*
4244                 * If we have a trans handle we can't wait because the flusher
4245                 * may have to commit the transaction, which would mean we would
4246                 * deadlock since we are waiting for the flusher to finish, but
4247                 * hold the current transaction open.
4248                 */
4249                if (current->journal_info)
4250                        return -EAGAIN;
4251                ret = wait_event_killable(space_info->wait, !space_info->flush);
4252                /* Must have been killed, return */
4253                if (ret)
4254                        return -EINTR;
4255
4256                spin_lock(&space_info->lock);
4257        }
4258
4259        ret = -ENOSPC;
4260        used = space_info->bytes_used + space_info->bytes_reserved +
4261                space_info->bytes_pinned + space_info->bytes_readonly +
4262                space_info->bytes_may_use;
4263
4264        /*
4265         * The idea here is that if we haven't already over-reserved the block
4266         * group, we can go ahead and save our reservation first and then start
4267         * flushing if we need to.  Otherwise, if we've already overcommitted,
4268         * let's start flushing stuff first and then come back and try to make
4269         * our reservation.
4270         */
4271        if (used <= space_info->total_bytes) {
4272                if (used + orig_bytes <= space_info->total_bytes) {
4273                        space_info->bytes_may_use += orig_bytes;
4274                        trace_btrfs_space_reservation(root->fs_info,
4275                                "space_info", space_info->flags, orig_bytes, 1);
4276                        ret = 0;
4277                } else {
4278                        /*
4279                         * Ok, set num_bytes to orig_bytes since we aren't
4280                         * overcommitted; this way we only try to reclaim what
4281                         * we need.
4282                         */
4283                        num_bytes = orig_bytes;
4284                }
4285        } else {
4286                /*
4287                 * Ok, we're overcommitted; set num_bytes to the overcommitted
4288                 * amount plus the amount of bytes that we need for this
4289                 * reservation.
4290                 */
4291                num_bytes = used - space_info->total_bytes +
4292                        (orig_bytes * 2);
4293        }
4294
4295        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4296                space_info->bytes_may_use += orig_bytes;
4297                trace_btrfs_space_reservation(root->fs_info, "space_info",
4298                                              space_info->flags, orig_bytes,
4299                                              1);
4300                ret = 0;
4301        }
4302
4303        /*
4304         * Couldn't make our reservation, save our place so while we're trying
4305         * to reclaim space we can actually use it instead of somebody else
4306         * stealing it from us.
4307         *
4308         * We make the other tasks wait for the flush only when we can flush
4309         * all things.
4310         */
4311        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4312                flushing = true;
4313                space_info->flush = 1;
4314        }
4315
4316        spin_unlock(&space_info->lock);
4317
4318        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4319                goto out;
4320
4321        ret = flush_space(root, space_info, num_bytes, orig_bytes,
4322                          flush_state);
4323        flush_state++;
4324
4325        /*
4326         * If we are FLUSH_LIMIT, we cannot flush delalloc or a deadlock
4327         * could happen, so skip the delalloc flush states.
4328         */
4329        if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4330            (flush_state == FLUSH_DELALLOC ||
4331             flush_state == FLUSH_DELALLOC_WAIT))
4332                flush_state = ALLOC_CHUNK;
4333
4334        if (!ret)
4335                goto again;
4336        else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4337                 flush_state < COMMIT_TRANS)
4338                goto again;
4339        else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4340                 flush_state <= COMMIT_TRANS)
4341                goto again;
4342
4343out:
4344        if (ret == -ENOSPC &&
4345            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4346                struct btrfs_block_rsv *global_rsv =
4347                        &root->fs_info->global_block_rsv;
4348
4349                if (block_rsv != global_rsv &&
4350                    !block_rsv_use_bytes(global_rsv, orig_bytes))
4351                        ret = 0;
4352        }
4353        if (ret == -ENOSPC)
4354                trace_btrfs_space_reservation(root->fs_info,
4355                                              "space_info:enospc",
4356                                              space_info->flags, orig_bytes, 1);
4357        if (flushing) {
4358                spin_lock(&space_info->lock);
4359                space_info->flush = 0;
4360                wake_up_all(&space_info->wait);
4361                spin_unlock(&space_info->lock);
4362        }
4363        return ret;
4364}
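
/*
 * Editor's note (illustrative, not part of the original source): callers pick
 * the flush mode based on what they are allowed to wait for.  For example,
 * btrfs_delalloc_reserve_metadata() below uses:
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, to_reserve,
 *				     BTRFS_RESERVE_FLUSH_ALL);
 *
 * for ordinary inodes, but BTRFS_RESERVE_NO_FLUSH for the free space inode,
 * since that runs in the middle of a transaction commit and must not recurse
 * into delalloc flushing or another commit.
 */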
4365
4366static struct btrfs_block_rsv *get_block_rsv(
4367                                        const struct btrfs_trans_handle *trans,
4368                                        const struct btrfs_root *root)
4369{
4370        struct btrfs_block_rsv *block_rsv = NULL;
4371
4372        if (root->ref_cows)
4373                block_rsv = trans->block_rsv;
4374
4375        if (root == root->fs_info->csum_root && trans->adding_csums)
4376                block_rsv = trans->block_rsv;
4377
4378        if (root == root->fs_info->uuid_root)
4379                block_rsv = trans->block_rsv;
4380
4381        if (!block_rsv)
4382                block_rsv = root->block_rsv;
4383
4384        if (!block_rsv)
4385                block_rsv = &root->fs_info->empty_block_rsv;
4386
4387        return block_rsv;
4388}
4389
4390static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4391                               u64 num_bytes)
4392{
4393        int ret = -ENOSPC;
4394        spin_lock(&block_rsv->lock);
4395        if (block_rsv->reserved >= num_bytes) {
4396                block_rsv->reserved -= num_bytes;
4397                if (block_rsv->reserved < block_rsv->size)
4398                        block_rsv->full = 0;
4399                ret = 0;
4400        }
4401        spin_unlock(&block_rsv->lock);
4402        return ret;
4403}
4404
4405static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4406                                u64 num_bytes, int update_size)
4407{
4408        spin_lock(&block_rsv->lock);
4409        block_rsv->reserved += num_bytes;
4410        if (update_size)
4411                block_rsv->size += num_bytes;
4412        else if (block_rsv->reserved >= block_rsv->size)
4413                block_rsv->full = 1;
4414        spin_unlock(&block_rsv->lock);
4415}
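
/*
 * Editor's worked example (not part of the original source): a block_rsv
 * tracks "size" (what we want reserved) and "reserved" (what we actually
 * hold); "full" means reserved has caught up with size.  Assuming a 64K
 * reservation:
 *
 *	block_rsv_add_bytes(rsv, 65536, 1);	// size = reserved = 64K
 *	block_rsv_use_bytes(rsv, 16384);	// reserved = 48K < size, full = 0
 *	block_rsv_add_bytes(rsv, 16384, 0);	// reserved = 64K >= size, full = 1
 */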
4416
4417int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4418                             struct btrfs_block_rsv *dest, u64 num_bytes,
4419                             int min_factor)
4420{
4421        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4422        u64 min_bytes;
4423
4424        if (global_rsv->space_info != dest->space_info)
4425                return -ENOSPC;
4426
4427        spin_lock(&global_rsv->lock);
4428        min_bytes = div_factor(global_rsv->size, min_factor);
4429        if (global_rsv->reserved < min_bytes + num_bytes) {
4430                spin_unlock(&global_rsv->lock);
4431                return -ENOSPC;
4432        }
4433        global_rsv->reserved -= num_bytes;
4434        if (global_rsv->reserved < global_rsv->size)
4435                global_rsv->full = 0;
4436        spin_unlock(&global_rsv->lock);
4437
4438        block_rsv_add_bytes(dest, num_bytes, 1);
4439        return 0;
4440}
4441
4442static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4443                                    struct btrfs_block_rsv *block_rsv,
4444                                    struct btrfs_block_rsv *dest, u64 num_bytes)
4445{
4446        struct btrfs_space_info *space_info = block_rsv->space_info;
4447
4448        spin_lock(&block_rsv->lock);
4449        if (num_bytes == (u64)-1)
4450                num_bytes = block_rsv->size;
4451        block_rsv->size -= num_bytes;
4452        if (block_rsv->reserved >= block_rsv->size) {
4453                num_bytes = block_rsv->reserved - block_rsv->size;
4454                block_rsv->reserved = block_rsv->size;
4455                block_rsv->full = 1;
4456        } else {
4457                num_bytes = 0;
4458        }
4459        spin_unlock(&block_rsv->lock);
4460
4461        if (num_bytes > 0) {
4462                if (dest) {
4463                        spin_lock(&dest->lock);
4464                        if (!dest->full) {
4465                                u64 bytes_to_add;
4466
4467                                bytes_to_add = dest->size - dest->reserved;
4468                                bytes_to_add = min(num_bytes, bytes_to_add);
4469                                dest->reserved += bytes_to_add;
4470                                if (dest->reserved >= dest->size)
4471                                        dest->full = 1;
4472                                num_bytes -= bytes_to_add;
4473                        }
4474                        spin_unlock(&dest->lock);
4475                }
4476                if (num_bytes) {
4477                        spin_lock(&space_info->lock);
4478                        space_info->bytes_may_use -= num_bytes;
4479                        trace_btrfs_space_reservation(fs_info, "space_info",
4480                                        space_info->flags, num_bytes, 0);
4481                        spin_unlock(&space_info->lock);
4482                }
4483        }
4484}
4485
4486static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4487                                   struct btrfs_block_rsv *dst, u64 num_bytes)
4488{
4489        int ret;
4490
4491        ret = block_rsv_use_bytes(src, num_bytes);
4492        if (ret)
4493                return ret;
4494
4495        block_rsv_add_bytes(dst, num_bytes, 1);
4496        return 0;
4497}
4498
4499void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4500{
4501        memset(rsv, 0, sizeof(*rsv));
4502        spin_lock_init(&rsv->lock);
4503        rsv->type = type;
4504}
4505
4506struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4507                                              unsigned short type)
4508{
4509        struct btrfs_block_rsv *block_rsv;
4510        struct btrfs_fs_info *fs_info = root->fs_info;
4511
4512        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4513        if (!block_rsv)
4514                return NULL;
4515
4516        btrfs_init_block_rsv(block_rsv, type);
4517        block_rsv->space_info = __find_space_info(fs_info,
4518                                                  BTRFS_BLOCK_GROUP_METADATA);
4519        return block_rsv;
4520}
4521
4522void btrfs_free_block_rsv(struct btrfs_root *root,
4523                          struct btrfs_block_rsv *rsv)
4524{
4525        if (!rsv)
4526                return;
4527        btrfs_block_rsv_release(root, rsv, (u64)-1);
4528        kfree(rsv);
4529}
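
/*
 * Editor's sketch (illustrative only): a typical lifetime for a short-lived
 * reservation, assuming the BTRFS_BLOCK_RSV_TEMP type used by other callers
 * in the tree:
 *
 *	struct btrfs_block_rsv *rsv;
 *	int ret;
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret) {
 *		// ... consume the space ...
 *	}
 *	btrfs_free_block_rsv(root, rsv);	// releases whatever is left
 */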
4530
4531int btrfs_block_rsv_add(struct btrfs_root *root,
4532                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4533                        enum btrfs_reserve_flush_enum flush)
4534{
4535        int ret;
4536
4537        if (num_bytes == 0)
4538                return 0;
4539
4540        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4541        if (!ret) {
4542                block_rsv_add_bytes(block_rsv, num_bytes, 1);
4543                return 0;
4544        }
4545
4546        return ret;
4547}
4548
4549int btrfs_block_rsv_check(struct btrfs_root *root,
4550                          struct btrfs_block_rsv *block_rsv, int min_factor)
4551{
4552        u64 num_bytes = 0;
4553        int ret = -ENOSPC;
4554
4555        if (!block_rsv)
4556                return 0;
4557
4558        spin_lock(&block_rsv->lock);
4559        num_bytes = div_factor(block_rsv->size, min_factor);
4560        if (block_rsv->reserved >= num_bytes)
4561                ret = 0;
4562        spin_unlock(&block_rsv->lock);
4563
4564        return ret;
4565}
4566
4567int btrfs_block_rsv_refill(struct btrfs_root *root,
4568                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4569                           enum btrfs_reserve_flush_enum flush)
4570{
4571        u64 num_bytes = 0;
4572        int ret = -ENOSPC;
4573
4574        if (!block_rsv)
4575                return 0;
4576
4577        spin_lock(&block_rsv->lock);
4578        num_bytes = min_reserved;
4579        if (block_rsv->reserved >= num_bytes)
4580                ret = 0;
4581        else
4582                num_bytes -= block_rsv->reserved;
4583        spin_unlock(&block_rsv->lock);
4584
4585        if (!ret)
4586                return 0;
4587
4588        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4589        if (!ret) {
4590                block_rsv_add_bytes(block_rsv, num_bytes, 0);
4591                return 0;
4592        }
4593
4594        return ret;
4595}
4596
4597int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4598                            struct btrfs_block_rsv *dst_rsv,
4599                            u64 num_bytes)
4600{
4601        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4602}
4603
4604void btrfs_block_rsv_release(struct btrfs_root *root,
4605                             struct btrfs_block_rsv *block_rsv,
4606                             u64 num_bytes)
4607{
4608        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4609        if (global_rsv == block_rsv ||
4610            block_rsv->space_info != global_rsv->space_info)
4611                global_rsv = NULL;
4612        block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4613                                num_bytes);
4614}
4615
4616/*
4617 * helper to calculate the size of the global block reservation.
4618 * the desired value is the sum of the space used by the extent tree,
4619 * checksum tree and root tree
4620 */
4621static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4622{
4623        struct btrfs_space_info *sinfo;
4624        u64 num_bytes;
4625        u64 meta_used;
4626        u64 data_used;
4627        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4628
4629        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4630        spin_lock(&sinfo->lock);
4631        data_used = sinfo->bytes_used;
4632        spin_unlock(&sinfo->lock);
4633
4634        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4635        spin_lock(&sinfo->lock);
4636        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4637                data_used = 0;
4638        meta_used = sinfo->bytes_used;
4639        spin_unlock(&sinfo->lock);
4640
4641        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4642                    csum_size * 2;
4643        num_bytes += div64_u64(data_used + meta_used, 50);
4644
4645        if (num_bytes * 3 > meta_used)
4646                num_bytes = div64_u64(meta_used, 3);
4647
4648        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4649}
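
/*
 * Editor's worked example (not part of the original source): with a 4K block
 * size and the default crc32c checksum (csum_size == 4), every 1GiB of data
 * contributes
 *
 *	(1GiB >> 12) * 4 * 2 = 2MiB
 *
 * for checksum items, plus 2% of (data_used + meta_used) for general tree
 * overhead; the result is then capped at a third of the metadata currently
 * in use and rounded up to a multiple of leafsize << 10.
 */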
4650
4651static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4652{
4653        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4654        struct btrfs_space_info *sinfo = block_rsv->space_info;
4655        u64 num_bytes;
4656
4657        num_bytes = calc_global_metadata_size(fs_info);
4658
4659        spin_lock(&sinfo->lock);
4660        spin_lock(&block_rsv->lock);
4661
4662        block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4663
4664        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4665                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4666                    sinfo->bytes_may_use;
4667
4668        if (sinfo->total_bytes > num_bytes) {
4669                num_bytes = sinfo->total_bytes - num_bytes;
4670                block_rsv->reserved += num_bytes;
4671                sinfo->bytes_may_use += num_bytes;
4672                trace_btrfs_space_reservation(fs_info, "space_info",
4673                                      sinfo->flags, num_bytes, 1);
4674        }
4675
4676        if (block_rsv->reserved >= block_rsv->size) {
4677                num_bytes = block_rsv->reserved - block_rsv->size;
4678                sinfo->bytes_may_use -= num_bytes;
4679                trace_btrfs_space_reservation(fs_info, "space_info",
4680                                      sinfo->flags, num_bytes, 0);
4681                block_rsv->reserved = block_rsv->size;
4682                block_rsv->full = 1;
4683        }
4684
4685        spin_unlock(&block_rsv->lock);
4686        spin_unlock(&sinfo->lock);
4687}
4688
4689static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4690{
4691        struct btrfs_space_info *space_info;
4692
4693        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4694        fs_info->chunk_block_rsv.space_info = space_info;
4695
4696        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4697        fs_info->global_block_rsv.space_info = space_info;
4698        fs_info->delalloc_block_rsv.space_info = space_info;
4699        fs_info->trans_block_rsv.space_info = space_info;
4700        fs_info->empty_block_rsv.space_info = space_info;
4701        fs_info->delayed_block_rsv.space_info = space_info;
4702
4703        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4704        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4705        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4706        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4707        if (fs_info->quota_root)
4708                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4709        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4710
4711        update_global_block_rsv(fs_info);
4712}
4713
4714static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4715{
4716        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4717                                (u64)-1);
4718        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4719        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4720        WARN_ON(fs_info->trans_block_rsv.size > 0);
4721        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4722        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4723        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4724        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4725        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4726}
4727
4728void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4729                                  struct btrfs_root *root)
4730{
4731        if (!trans->block_rsv)
4732                return;
4733
4734        if (!trans->bytes_reserved)
4735                return;
4736
4737        trace_btrfs_space_reservation(root->fs_info, "transaction",
4738                                      trans->transid, trans->bytes_reserved, 0);
4739        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4740        trans->bytes_reserved = 0;
4741}
4742
4743/* Can only return 0 or -ENOSPC */
4744int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4745                                  struct inode *inode)
4746{
4747        struct btrfs_root *root = BTRFS_I(inode)->root;
4748        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4749        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4750
4751        /*
4752         * We need to hold space in order to delete our orphan item once we've
4753         * added it, so this takes the reservation so we can release it later
4754         * when we are truly done with the orphan item.
4755         */
4756        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4757        trace_btrfs_space_reservation(root->fs_info, "orphan",
4758                                      btrfs_ino(inode), num_bytes, 1);
4759        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4760}
4761
4762void btrfs_orphan_release_metadata(struct inode *inode)
4763{
4764        struct btrfs_root *root = BTRFS_I(inode)->root;
4765        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4766        trace_btrfs_space_reservation(root->fs_info, "orphan",
4767                                      btrfs_ino(inode), num_bytes, 0);
4768        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4769}
4770
4771/*
4772 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4773 * root: the root of the parent directory
4774 * rsv: block reservation
4775 * items: the number of items that we need to reserve for
4776 * qgroup_reserved: used to return the reserved size in qgroup
4777 *
4778 * This function is used to reserve the space for snapshot/subvolume
4779 * creation and deletion. Those operations differ from the common
4780 * file/directory operations: they change two fs/file trees and the
4781 * root tree, and the number of items that the qgroup reserves is
4782 * different from the free space reservation. So we cannot use
4783 * the space reservation mechanism in start_transaction().
4784 */
4785int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4786                                     struct btrfs_block_rsv *rsv,
4787                                     int items,
4788                                     u64 *qgroup_reserved,
4789                                     bool use_global_rsv)
4790{
4791        u64 num_bytes;
4792        int ret;
4793        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4794
4795        if (root->fs_info->quota_enabled) {
4796                /* One for parent inode, two for dir entries */
4797                num_bytes = 3 * root->leafsize;
4798                ret = btrfs_qgroup_reserve(root, num_bytes);
4799                if (ret)
4800                        return ret;
4801        } else {
4802                num_bytes = 0;
4803        }
4804
4805        *qgroup_reserved = num_bytes;
4806
4807        num_bytes = btrfs_calc_trans_metadata_size(root, items);
4808        rsv->space_info = __find_space_info(root->fs_info,
4809                                            BTRFS_BLOCK_GROUP_METADATA);
4810        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4811                                  BTRFS_RESERVE_FLUSH_ALL);
4812
4813        if (ret == -ENOSPC && use_global_rsv)
4814                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
4815
4816        if (ret) {
4817                if (*qgroup_reserved)
4818                        btrfs_qgroup_free(root, *qgroup_reserved);
4819        }
4820
4821        return ret;
4822}
4823
4824void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4825                                      struct btrfs_block_rsv *rsv,
4826                                      u64 qgroup_reserved)
4827{
4828        btrfs_block_rsv_release(root, rsv, (u64)-1);
4829        if (qgroup_reserved)
4830                btrfs_qgroup_free(root, qgroup_reserved);
4831}
4832
4833/**
4834 * drop_outstanding_extent - drop an outstanding extent
4835 * @inode: the inode we're dropping the extent for
4836 *
4837 * This is called when we are freeing up an outstanding extent, either
4838 * after an error or after an extent is written.  This will return the number of
4839 * reserved extents that need to be freed.  This must be called with
4840 * BTRFS_I(inode)->lock held.
4841 */
4842static unsigned drop_outstanding_extent(struct inode *inode)
4843{
4844        unsigned drop_inode_space = 0;
4845        unsigned dropped_extents = 0;
4846
4847        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4848        BTRFS_I(inode)->outstanding_extents--;
4849
4850        if (BTRFS_I(inode)->outstanding_extents == 0 &&
4851            test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4852                               &BTRFS_I(inode)->runtime_flags))
4853                drop_inode_space = 1;
4854
4855        /*
4856         * If we have at least as many outstanding extents as we have
4857         * reserved, then we need to leave the reserved extents count alone.
4858         */
4859        if (BTRFS_I(inode)->outstanding_extents >=
4860            BTRFS_I(inode)->reserved_extents)
4861                return drop_inode_space;
4862
4863        dropped_extents = BTRFS_I(inode)->reserved_extents -
4864                BTRFS_I(inode)->outstanding_extents;
4865        BTRFS_I(inode)->reserved_extents -= dropped_extents;
4866        return dropped_extents + drop_inode_space;
4867}
4868
4869/**
4870 * calc_csum_metadata_size - return the amount of metadata space that must be
4871 *      reserved/freed for the given bytes.
4872 * @inode: the inode we're manipulating
4873 * @num_bytes: the number of bytes in question
4874 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4875 *
4876 * This adjusts the number of csum_bytes in the inode and then returns the
4877 * correct amount of metadata that must either be reserved or freed.  We
4878 * calculate how many checksums we can fit into one leaf and then divide the
4879 * number of bytes that will need to be checksummed by this value to figure out
4880 * how many checksums will be required.  If we are adding bytes then the number
4881 * may go up and we will return the number of additional bytes that must be
4882 * reserved.  If it is going down we will return the number of bytes that must
4883 * be freed.
4884 *
4885 * This must be called with BTRFS_I(inode)->lock held.
4886 */
4887static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4888                                   int reserve)
4889{
4890        struct btrfs_root *root = BTRFS_I(inode)->root;
4891        u64 csum_size;
4892        int num_csums_per_leaf;
4893        int num_csums;
4894        int old_csums;
4895
4896        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4897            BTRFS_I(inode)->csum_bytes == 0)
4898                return 0;
4899
4900        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4901        if (reserve)
4902                BTRFS_I(inode)->csum_bytes += num_bytes;
4903        else
4904                BTRFS_I(inode)->csum_bytes -= num_bytes;
4905        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4906        num_csums_per_leaf = (int)div64_u64(csum_size,
4907                                            sizeof(struct btrfs_csum_item) +
4908                                            sizeof(struct btrfs_disk_key));
4909        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4910        num_csums = num_csums + num_csums_per_leaf - 1;
4911        num_csums = num_csums / num_csums_per_leaf;
4912
4913        old_csums = old_csums + num_csums_per_leaf - 1;
4914        old_csums = old_csums / num_csums_per_leaf;
4915
4916        /* No change, no need to reserve more */
4917        if (old_csums == num_csums)
4918                return 0;
4919
4920        if (reserve)
4921                return btrfs_calc_trans_metadata_size(root,
4922                                                      num_csums - old_csums);
4923
4924        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4925}
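
/*
 * Editor's sketch (not part of the original source): in effect the function
 * above computes, using hypothetical names,
 *
 *	old_leaves = DIV_ROUND_UP(old_csum_bytes / sectorsize, csums_per_leaf);
 *	new_leaves = DIV_ROUND_UP(new_csum_bytes / sectorsize, csums_per_leaf);
 *	return btrfs_calc_trans_metadata_size(root,
 *					       abs(new_leaves - old_leaves));
 *
 * i.e. only a change in the number of checksum leaves the inode needs makes
 * the reservation grow or shrink.
 */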
4926
4927int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4928{
4929        struct btrfs_root *root = BTRFS_I(inode)->root;
4930        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4931        u64 to_reserve = 0;
4932        u64 csum_bytes;
4933        unsigned nr_extents = 0;
4934        int extra_reserve = 0;
4935        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4936        int ret = 0;
4937        bool delalloc_lock = true;
4938        u64 to_free = 0;
4939        unsigned dropped;
4940
4941        /* If we are a free space inode we need to not flush since we will be in
4942         * the middle of a transaction commit.  We also don't need the delalloc
4943         * mutex since we won't race with anybody.  We need this mostly to make
4944         * lockdep shut its filthy mouth.
4945         */
4946        if (btrfs_is_free_space_inode(inode)) {
4947                flush = BTRFS_RESERVE_NO_FLUSH;
4948                delalloc_lock = false;
4949        }
4950
4951        if (flush != BTRFS_RESERVE_NO_FLUSH &&
4952            btrfs_transaction_in_commit(root->fs_info))
4953                schedule_timeout(1);
4954
4955        if (delalloc_lock)
4956                mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4957
4958        num_bytes = ALIGN(num_bytes, root->sectorsize);
4959
4960        spin_lock(&BTRFS_I(inode)->lock);
4961        BTRFS_I(inode)->outstanding_extents++;
4962
4963        if (BTRFS_I(inode)->outstanding_extents >
4964            BTRFS_I(inode)->reserved_extents)
4965                nr_extents = BTRFS_I(inode)->outstanding_extents -
4966                        BTRFS_I(inode)->reserved_extents;
4967
4968        /*
4969         * Add an item to reserve for updating the inode when we complete the
4970         * delalloc io.
4971         */
4972        if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4973                      &BTRFS_I(inode)->runtime_flags)) {
4974                nr_extents++;
4975                extra_reserve = 1;
4976        }
4977
4978        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4979        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4980        csum_bytes = BTRFS_I(inode)->csum_bytes;
4981        spin_unlock(&BTRFS_I(inode)->lock);
4982
4983        if (root->fs_info->quota_enabled) {
4984                ret = btrfs_qgroup_reserve(root, num_bytes +
4985                                           nr_extents * root->leafsize);
4986                if (ret)
4987                        goto out_fail;
4988        }
4989
4990        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4991        if (unlikely(ret)) {
4992                if (root->fs_info->quota_enabled)
4993                        btrfs_qgroup_free(root, num_bytes +
4994                                                nr_extents * root->leafsize);
4995                goto out_fail;
4996        }
4997
4998        spin_lock(&BTRFS_I(inode)->lock);
4999        if (extra_reserve) {
5000                set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5001                        &BTRFS_I(inode)->runtime_flags);
5002                nr_extents--;
5003        }
5004        BTRFS_I(inode)->reserved_extents += nr_extents;
5005        spin_unlock(&BTRFS_I(inode)->lock);
5006
5007        if (delalloc_lock)
5008                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5009
5010        if (to_reserve)
5011                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5012                                              btrfs_ino(inode), to_reserve, 1);
5013        block_rsv_add_bytes(block_rsv, to_reserve, 1);
5014
5015        return 0;
5016
5017out_fail:
5018        spin_lock(&BTRFS_I(inode)->lock);
5019        dropped = drop_outstanding_extent(inode);
5020        /*
5021         * If the inode's csum_bytes is the same as the original
5022         * csum_bytes then we know we haven't raced with any free()ers,
5023         * so we can just reduce our inode's csum bytes and carry on.
5024         */
5025        if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5026                calc_csum_metadata_size(inode, num_bytes, 0);
5027        } else {
5028                u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5029                u64 bytes;
5030
5031                /*
5032                 * This is tricky, but first we need to figure out how much we
5033                 * freed from any free-ers that occurred during this
5034                 * reservation, so we reset ->csum_bytes to the csum_bytes
5035                 * before we dropped our lock, and then call the free for the
5036                 * number of bytes that were freed while we were trying our
5037                 * reservation.
5038                 */
5039                bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5040                BTRFS_I(inode)->csum_bytes = csum_bytes;
5041                to_free = calc_csum_metadata_size(inode, bytes, 0);
5042
5043
5044                /*
5045                 * Now we need to see how much we would have freed had we not
5046                 * been making this reservation and our ->csum_bytes were not
5047                 * artificially inflated.
5048                 */
5049                BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5050                bytes = csum_bytes - orig_csum_bytes;
5051                bytes = calc_csum_metadata_size(inode, bytes, 0);
5052
5053                /*
5054                 * Now reset ->csum_bytes to what it should be.  If bytes is
5055                 * more than to_free then we would have freed more space had we
5056                 * not had an artificially high ->csum_bytes, so we need to free
5057                 * the remainder.  If bytes is the same or less then we don't
5058                 * need to do anything, the other free-ers did the correct
5059                 * thing.
5060                 */
5061                BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5062                if (bytes > to_free)
5063                        to_free = bytes - to_free;
5064                else
5065                        to_free = 0;
5066        }
5067        spin_unlock(&BTRFS_I(inode)->lock);
5068        if (dropped)
5069                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5070
5071        if (to_free) {
5072                btrfs_block_rsv_release(root, block_rsv, to_free);
5073                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5074                                              btrfs_ino(inode), to_free, 0);
5075        }
5076        if (delalloc_lock)
5077                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5078        return ret;
5079}
5080
5081/**
5082 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5083 * @inode: the inode to release the reservation for
5084 * @num_bytes: the number of bytes we're releasing
5085 *
5086 * This will release the metadata reservation for an inode.  This can be called
5087 * once we complete IO for a given set of bytes to release their metadata
5088 * reservations.
5089 */
5090void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5091{
5092        struct btrfs_root *root = BTRFS_I(inode)->root;
5093        u64 to_free = 0;
5094        unsigned dropped;
5095
5096        num_bytes = ALIGN(num_bytes, root->sectorsize);
5097        spin_lock(&BTRFS_I(inode)->lock);
5098        dropped = drop_outstanding_extent(inode);
5099
5100        if (num_bytes)
5101                to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5102        spin_unlock(&BTRFS_I(inode)->lock);
5103        if (dropped > 0)
5104                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5105
5106        trace_btrfs_space_reservation(root->fs_info, "delalloc",
5107                                      btrfs_ino(inode), to_free, 0);
5108        if (root->fs_info->quota_enabled) {
5109                btrfs_qgroup_free(root, num_bytes +
5110                                        dropped * root->leafsize);
5111        }
5112
5113        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5114                                to_free);
5115}
5116
5117/**
5118 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5119 * @inode: inode we're writing to
5120 * @num_bytes: the number of bytes we want to allocate
5121 *
5122 * This will do the following things
5123 *
5124 * o reserve space in the data space info for num_bytes
5125 * o reserve space in the metadata space info based on number of outstanding
5126 *   extents and how much csums will be needed
5127 * o add to the inode's ->delalloc_bytes
5128 * o add it to the fs_info's delalloc inodes list.
5129 *
5130 * This will return 0 for success and -ENOSPC if there is no space left.
5131 */
5132int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5133{
5134        int ret;
5135
5136        ret = btrfs_check_data_free_space(inode, num_bytes);
5137        if (ret)
5138                return ret;
5139
5140        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5141        if (ret) {
5142                btrfs_free_reserved_data_space(inode, num_bytes);
5143                return ret;
5144        }
5145
5146        return 0;
5147}
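
/*
 * Editor's note (illustrative, not part of the original source): a buffered
 * write path pairs this with btrfs_delalloc_release_space() on failure, e.g.
 * with a hypothetical copy step:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = copy_pages(...);				// hypothetical
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, num_bytes);
 */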
5148
5149/**
5150 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5151 * @inode: inode we're releasing space for
5152 * @num_bytes: the number of bytes we want to free up
5153 *
5154 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5155 * called in the case that we don't need the metadata AND data reservations
5156 * anymore, for example if there is an error or we insert an inline extent.
5157 *
5158 * This function will release the metadata space that was not used and will
5159 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5160 * list if there are no delalloc bytes left.
5161 */
5162void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5163{
5164        btrfs_delalloc_release_metadata(inode, num_bytes);
5165        btrfs_free_reserved_data_space(inode, num_bytes);
5166}
5167
5168static int update_block_group(struct btrfs_root *root,
5169                              u64 bytenr, u64 num_bytes, int alloc)
5170{
5171        struct btrfs_block_group_cache *cache = NULL;
5172        struct btrfs_fs_info *info = root->fs_info;
5173        u64 total = num_bytes;
5174        u64 old_val;
5175        u64 byte_in_group;
5176        int factor;
5177
5178        /* block accounting for super block */
5179        spin_lock(&info->delalloc_root_lock);
5180        old_val = btrfs_super_bytes_used(info->super_copy);
5181        if (alloc)
5182                old_val += num_bytes;
5183        else
5184                old_val -= num_bytes;
5185        btrfs_set_super_bytes_used(info->super_copy, old_val);
5186        spin_unlock(&info->delalloc_root_lock);
5187
5188        while (total) {
5189                cache = btrfs_lookup_block_group(info, bytenr);
5190                if (!cache)
5191                        return -ENOENT;
5192                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5193                                    BTRFS_BLOCK_GROUP_RAID1 |
5194                                    BTRFS_BLOCK_GROUP_RAID10))
5195                        factor = 2;
5196                else
5197                        factor = 1;
5198                /*
5199                 * If this block group has its free space cache written out, we
5200                 * need to make sure to load it if we are removing space.  This
5201                 * is because we need the unpinning stage to actually add the
5202                 * space back to the block group, otherwise we will leak space.
5203                 */
5204                if (!alloc && cache->cached == BTRFS_CACHE_NO)
5205                        cache_block_group(cache, 1);
5206
5207                byte_in_group = bytenr - cache->key.objectid;
5208                WARN_ON(byte_in_group > cache->key.offset);
5209
5210                spin_lock(&cache->space_info->lock);
5211                spin_lock(&cache->lock);
5212
5213                if (btrfs_test_opt(root, SPACE_CACHE) &&
5214                    cache->disk_cache_state < BTRFS_DC_CLEAR)
5215                        cache->disk_cache_state = BTRFS_DC_CLEAR;
5216
5217                cache->dirty = 1;
5218                old_val = btrfs_block_group_used(&cache->item);
5219                num_bytes = min(total, cache->key.offset - byte_in_group);
5220                if (alloc) {
5221                        old_val += num_bytes;
5222                        btrfs_set_block_group_used(&cache->item, old_val);
5223                        cache->reserved -= num_bytes;
5224                        cache->space_info->bytes_reserved -= num_bytes;
5225                        cache->space_info->bytes_used += num_bytes;
5226                        cache->space_info->disk_used += num_bytes * factor;
5227                        spin_unlock(&cache->lock);
5228                        spin_unlock(&cache->space_info->lock);
5229                } else {
5230                        old_val -= num_bytes;
5231                        btrfs_set_block_group_used(&cache->item, old_val);
5232                        cache->pinned += num_bytes;
5233                        cache->space_info->bytes_pinned += num_bytes;
5234                        cache->space_info->bytes_used -= num_bytes;
5235                        cache->space_info->disk_used -= num_bytes * factor;
5236                        spin_unlock(&cache->lock);
5237                        spin_unlock(&cache->space_info->lock);
5238
5239                        set_extent_dirty(info->pinned_extents,
5240                                         bytenr, bytenr + num_bytes - 1,
5241                                         GFP_NOFS | __GFP_NOFAIL);
5242                }
5243                btrfs_put_block_group(cache);
5244                total -= num_bytes;
5245                bytenr += num_bytes;
5246        }
5247        return 0;
5248}
5249
5250static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5251{
5252        struct btrfs_block_group_cache *cache;
5253        u64 bytenr;
5254
5255        spin_lock(&root->fs_info->block_group_cache_lock);
5256        bytenr = root->fs_info->first_logical_byte;
5257        spin_unlock(&root->fs_info->block_group_cache_lock);
5258
5259        if (bytenr < (u64)-1)
5260                return bytenr;
5261
5262        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5263        if (!cache)
5264                return 0;
5265
5266        bytenr = cache->key.objectid;
5267        btrfs_put_block_group(cache);
5268
5269        return bytenr;
5270}
5271
5272static int pin_down_extent(struct btrfs_root *root,
5273                           struct btrfs_block_group_cache *cache,
5274                           u64 bytenr, u64 num_bytes, int reserved)
5275{
5276        spin_lock(&cache->space_info->lock);
5277        spin_lock(&cache->lock);
5278        cache->pinned += num_bytes;
5279        cache->space_info->bytes_pinned += num_bytes;
5280        if (reserved) {
5281                cache->reserved -= num_bytes;
5282                cache->space_info->bytes_reserved -= num_bytes;
5283        }
5284        spin_unlock(&cache->lock);
5285        spin_unlock(&cache->space_info->lock);
5286
5287        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5288                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5289        if (reserved)
5290                trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5291        return 0;
5292}
5293
5294/*
5295 * this function must be called within transaction
5296 */
5297int btrfs_pin_extent(struct btrfs_root *root,
5298                     u64 bytenr, u64 num_bytes, int reserved)
5299{
5300        struct btrfs_block_group_cache *cache;
5301
5302        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5303        BUG_ON(!cache); /* Logic error */
5304
5305        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5306
5307        btrfs_put_block_group(cache);
5308        return 0;
5309}
5310
5311/*
5312 * this function must be called within transaction
5313 */
5314int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5315                                    u64 bytenr, u64 num_bytes)
5316{
5317        struct btrfs_block_group_cache *cache;
5318        int ret;
5319
5320        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5321        if (!cache)
5322                return -EINVAL;
5323
5324        /*
5325         * pull in the free space cache (if any) so that our pin
5326         * removes the free space from the cache.  We have load_only set
5327         * to one because the slow code to read in the free extents does check
5328         * the pinned extents.
5329         */
5330        cache_block_group(cache, 1);
5331
5332        pin_down_extent(root, cache, bytenr, num_bytes, 0);
5333
5334        /* remove us from the free space cache (if we're there at all) */
5335        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5336        btrfs_put_block_group(cache);
5337        return ret;
5338}
5339
5340static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5341{
5342        int ret;
5343        struct btrfs_block_group_cache *block_group;
5344        struct btrfs_caching_control *caching_ctl;
5345
5346        block_group = btrfs_lookup_block_group(root->fs_info, start);
5347        if (!block_group)
5348                return -EINVAL;
5349
5350        cache_block_group(block_group, 0);
5351        caching_ctl = get_caching_control(block_group);
5352
5353        if (!caching_ctl) {
5354                /* Logic error */
5355                BUG_ON(!block_group_cache_done(block_group));
5356                ret = btrfs_remove_free_space(block_group, start, num_bytes);
5357        } else {
5358                mutex_lock(&caching_ctl->mutex);
5359
5360                if (start >= caching_ctl->progress) {
5361                        ret = add_excluded_extent(root, start, num_bytes);
5362                } else if (start + num_bytes <= caching_ctl->progress) {
5363                        ret = btrfs_remove_free_space(block_group,
5364                                                      start, num_bytes);
5365                } else {
5366                        num_bytes = caching_ctl->progress - start;
5367                        ret = btrfs_remove_free_space(block_group,
5368                                                      start, num_bytes);
5369                        if (ret)
5370                                goto out_lock;
5371
5372                        num_bytes = (start + num_bytes) -
5373                                caching_ctl->progress;
5374                        start = caching_ctl->progress;
5375                        ret = add_excluded_extent(root, start, num_bytes);
5376                }
5377out_lock:
5378                mutex_unlock(&caching_ctl->mutex);
5379                put_caching_control(caching_ctl);
5380        }
5381        btrfs_put_block_group(block_group);
5382        return ret;
5383}
5384
5385int btrfs_exclude_logged_extents(struct btrfs_root *log,
5386                                 struct extent_buffer *eb)
5387{
5388        struct btrfs_file_extent_item *item;
5389        struct btrfs_key key;
5390        int found_type;
5391        int i;
5392
5393        if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5394                return 0;
5395
5396        for (i = 0; i < btrfs_header_nritems(eb); i++) {
5397                btrfs_item_key_to_cpu(eb, &key, i);
5398                if (key.type != BTRFS_EXTENT_DATA_KEY)
5399                        continue;
5400                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5401                found_type = btrfs_file_extent_type(eb, item);
5402                if (found_type == BTRFS_FILE_EXTENT_INLINE)
5403                        continue;
5404                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5405                        continue;
5406                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5407                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5408                __exclude_logged_extent(log, key.objectid, key.offset);
5409        }
5410
5411        return 0;
5412}
5413
5414/**
5415 * btrfs_update_reserved_bytes - update the block_group and space info counters
5416 * @cache:      The cache we are manipulating
5417 * @num_bytes:  The number of bytes in question
5418 * @reserve:    One of the reservation enums
5419 *
5420 * This is called by the allocator when it reserves space, or by somebody who is
5421 * freeing space that was never actually used on disk.  For example if you
5422 * reserve some space for a new leaf in transaction A and before transaction A
5423 * commits you free that leaf, you call this with reserve set to 0 in order to
5424 * clear the reservation.
5425 *
5426 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5427 * ENOSPC accounting.  For data we handle the reservation through clearing the
5428 * delalloc bits in the io_tree.  We have to do this since we could end up
5429 * allocating less disk space for the amount of data we have reserved in the
5430 * case of compression.
5431 *
5432 * If this is a reservation and the block group has become read only we cannot
5433 * make the reservation and return -EAGAIN, otherwise this function always
5434 * succeeds.
5435 */
5436static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5437                                       u64 num_bytes, int reserve)
5438{
5439        struct btrfs_space_info *space_info = cache->space_info;
5440        int ret = 0;
5441
5442        spin_lock(&space_info->lock);
5443        spin_lock(&cache->lock);
5444        if (reserve != RESERVE_FREE) {
5445                if (cache->ro) {
5446                        ret = -EAGAIN;
5447                } else {
5448                        cache->reserved += num_bytes;
5449                        space_info->bytes_reserved += num_bytes;
5450                        if (reserve == RESERVE_ALLOC) {
5451                                trace_btrfs_space_reservation(cache->fs_info,
5452                                                "space_info", space_info->flags,
5453                                                num_bytes, 0);
5454                                space_info->bytes_may_use -= num_bytes;
5455                        }
5456                }
5457        } else {
5458                if (cache->ro)
5459                        space_info->bytes_readonly += num_bytes;
5460                cache->reserved -= num_bytes;
5461                space_info->bytes_reserved -= num_bytes;
5462        }
5463        spin_unlock(&cache->lock);
5464        spin_unlock(&space_info->lock);
5465        return ret;
5466}
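
/*
 * Editor's note (illustrative, not part of the original source): the
 * allocator charges a successful allocation with RESERVE_ALLOC (moving the
 * bytes from bytes_may_use to bytes_reserved) and undoes an unused
 * reservation with RESERVE_FREE, e.g.:
 *
 *	ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);
 *	...
 *	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
 */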
5467
5468void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5469                                struct btrfs_root *root)
5470{
5471        struct btrfs_fs_info *fs_info = root->fs_info;
5472        struct btrfs_caching_control *next;
5473        struct btrfs_caching_control *caching_ctl;
5474        struct btrfs_block_group_cache *cache;
5475        struct btrfs_space_info *space_info;
5476
5477        down_write(&fs_info->commit_root_sem);
5478
5479        list_for_each_entry_safe(caching_ctl, next,
5480                                 &fs_info->caching_block_groups, list) {
5481                cache = caching_ctl->block_group;
5482                if (block_group_cache_done(cache)) {
5483                        cache->last_byte_to_unpin = (u64)-1;
5484                        list_del_init(&caching_ctl->list);
5485                        put_caching_control(caching_ctl);
5486                } else {
5487                        cache->last_byte_to_unpin = caching_ctl->progress;
5488                }
5489        }
5490
5491        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5492                fs_info->pinned_extents = &fs_info->freed_extents[1];
5493        else
5494                fs_info->pinned_extents = &fs_info->freed_extents[0];
5495
5496        up_write(&fs_info->commit_root_sem);
5497
5498        list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5499                percpu_counter_set(&space_info->total_bytes_pinned, 0);
5500
5501        update_global_block_rsv(fs_info);
5502}
5503
5504static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5505{
5506        struct btrfs_fs_info *fs_info = root->fs_info;
5507        struct btrfs_block_group_cache *cache = NULL;
5508        struct btrfs_space_info *space_info;
5509        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5510        u64 len;
5511        bool readonly;
5512
5513        while (start <= end) {
5514                readonly = false;
5515                if (!cache ||
5516                    start >= cache->key.objectid + cache->key.offset) {
5517                        if (cache)
5518                                btrfs_put_block_group(cache);
5519                        cache = btrfs_lookup_block_group(fs_info, start);
5520                        BUG_ON(!cache); /* Logic error */
5521                }
5522
5523                len = cache->key.objectid + cache->key.offset - start;
5524                len = min(len, end + 1 - start);
5525
5526                if (start < cache->last_byte_to_unpin) {
5527                        len = min(len, cache->last_byte_to_unpin - start);
5528                        btrfs_add_free_space(cache, start, len);
5529                }
5530
5531                start += len;
5532                space_info = cache->space_info;
5533
5534                spin_lock(&space_info->lock);
5535                spin_lock(&cache->lock);
5536                cache->pinned -= len;
5537                space_info->bytes_pinned -= len;
5538                if (cache->ro) {
5539                        space_info->bytes_readonly += len;
5540                        readonly = true;
5541                }
5542                spin_unlock(&cache->lock);
5543                if (!readonly && global_rsv->space_info == space_info) {
5544                        spin_lock(&global_rsv->lock);
5545                        if (!global_rsv->full) {
5546                                len = min(len, global_rsv->size -
5547                                          global_rsv->reserved);
5548                                global_rsv->reserved += len;
5549                                space_info->bytes_may_use += len;
5550                                if (global_rsv->reserved >= global_rsv->size)
5551                                        global_rsv->full = 1;
5552                        }
5553                        spin_unlock(&global_rsv->lock);
5554                }
5555                spin_unlock(&space_info->lock);
5556        }
5557
5558        if (cache)
5559                btrfs_put_block_group(cache);
5560        return 0;
5561}
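/*
 * A small worked example of the global_rsv top-up above (numbers are
 * illustrative): unpinning 1 MiB in a writable metadata block group while
 * global_rsv->size == 512 KiB and global_rsv->reserved == 256 KiB drops
 * pinned/bytes_pinned by the full 1 MiB, but only
 * min(1 MiB, 512 KiB - 256 KiB) == 256 KiB is added to global_rsv->reserved
 * and to space_info->bytes_may_use, which also marks the rsv full.
 */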
5562
5563int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5564                               struct btrfs_root *root)
5565{
5566        struct btrfs_fs_info *fs_info = root->fs_info;
5567        struct extent_io_tree *unpin;
5568        u64 start;
5569        u64 end;
5570        int ret;
5571
5572        if (trans->aborted)
5573                return 0;
5574
5575        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5576                unpin = &fs_info->freed_extents[1];
5577        else
5578                unpin = &fs_info->freed_extents[0];
5579
5580        while (1) {
5581                ret = find_first_extent_bit(unpin, 0, &start, &end,
5582                                            EXTENT_DIRTY, NULL);
5583                if (ret)
5584                        break;
5585
5586                if (btrfs_test_opt(root, DISCARD))
5587                        ret = btrfs_discard_extent(root, start,
5588                                                   end + 1 - start, NULL);
5589
5590                clear_extent_dirty(unpin, start, end, GFP_NOFS);
5591                unpin_extent_range(root, start, end);
5592                cond_resched();
5593        }
5594
5595        return 0;
5596}
5597
5598static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5599                             u64 owner, u64 root_objectid)
5600{
5601        struct btrfs_space_info *space_info;
5602        u64 flags;
5603
5604        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5605                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5606                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
5607                else
5608                        flags = BTRFS_BLOCK_GROUP_METADATA;
5609        } else {
5610                flags = BTRFS_BLOCK_GROUP_DATA;
5611        }
5612
5613        space_info = __find_space_info(fs_info, flags);
5614        BUG_ON(!space_info); /* Logic bug */
5615        percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5616}
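/*
 * How the owner/root checks above pick a space_info, assuming the usual
 * meaning of these objectids (the concrete values are illustrative):
 *
 *	owner 1 (a tree block level), root BTRFS_CHUNK_TREE_OBJECTID
 *		-> BTRFS_BLOCK_GROUP_SYSTEM
 *	owner 0 (a tree block level), root BTRFS_FS_TREE_OBJECTID
 *		-> BTRFS_BLOCK_GROUP_METADATA
 *	owner 257 (>= BTRFS_FIRST_FREE_OBJECTID, i.e. a file extent)
 *		-> BTRFS_BLOCK_GROUP_DATA
 */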
5617
5618
5619static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5620                                struct btrfs_root *root,
5621                                u64 bytenr, u64 num_bytes, u64 parent,
5622                                u64 root_objectid, u64 owner_objectid,
5623                                u64 owner_offset, int refs_to_drop,
5624                                struct btrfs_delayed_extent_op *extent_op)
5625{
5626        struct btrfs_key key;
5627        struct btrfs_path *path;
5628        struct btrfs_fs_info *info = root->fs_info;
5629        struct btrfs_root *extent_root = info->extent_root;
5630        struct extent_buffer *leaf;
5631        struct btrfs_extent_item *ei;
5632        struct btrfs_extent_inline_ref *iref;
5633        int ret;
5634        int is_data;
5635        int extent_slot = 0;
5636        int found_extent = 0;
5637        int num_to_del = 1;
5638        u32 item_size;
5639        u64 refs;
5640        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5641                                                 SKINNY_METADATA);
5642
5643        path = btrfs_alloc_path();
5644        if (!path)
5645                return -ENOMEM;
5646
5647        path->reada = 1;
5648        path->leave_spinning = 1;
5649
5650        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5651        BUG_ON(!is_data && refs_to_drop != 1);
5652
5653        if (is_data)
5654                skinny_metadata = 0;
5655
5656        ret = lookup_extent_backref(trans, extent_root, path, &iref,
5657                                    bytenr, num_bytes, parent,
5658                                    root_objectid, owner_objectid,
5659                                    owner_offset);
5660        if (ret == 0) {
5661                extent_slot = path->slots[0];
5662                while (extent_slot >= 0) {
5663                        btrfs_item_key_to_cpu(path->nodes[0], &key,
5664                                              extent_slot);
5665                        if (key.objectid != bytenr)
5666                                break;
5667                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5668                            key.offset == num_bytes) {
5669                                found_extent = 1;
5670                                break;
5671                        }
5672                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
5673                            key.offset == owner_objectid) {
5674                                found_extent = 1;
5675                                break;
5676                        }
5677                        if (path->slots[0] - extent_slot > 5)
5678                                break;
5679                        extent_slot--;
5680                }
5681#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5682                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5683                if (found_extent && item_size < sizeof(*ei))
5684                        found_extent = 0;
5685#endif
5686                if (!found_extent) {
5687                        BUG_ON(iref);
5688                        ret = remove_extent_backref(trans, extent_root, path,
5689                                                    NULL, refs_to_drop,
5690                                                    is_data);
5691                        if (ret) {
5692                                btrfs_abort_transaction(trans, extent_root, ret);
5693                                goto out;
5694                        }
5695                        btrfs_release_path(path);
5696                        path->leave_spinning = 1;
5697
5698                        key.objectid = bytenr;
5699                        key.type = BTRFS_EXTENT_ITEM_KEY;
5700                        key.offset = num_bytes;
5701
5702                        if (!is_data && skinny_metadata) {
5703                                key.type = BTRFS_METADATA_ITEM_KEY;
5704                                key.offset = owner_objectid;
5705                        }
5706
5707                        ret = btrfs_search_slot(trans, extent_root,
5708                                                &key, path, -1, 1);
5709                        if (ret > 0 && skinny_metadata && path->slots[0]) {
5710                                /*
5711                                 * Couldn't find our skinny metadata item,
5712                                 * see if we have ye olde extent item.
5713                                 */
5714                                path->slots[0]--;
5715                                btrfs_item_key_to_cpu(path->nodes[0], &key,
5716                                                      path->slots[0]);
5717                                if (key.objectid == bytenr &&
5718                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
5719                                    key.offset == num_bytes)
5720                                        ret = 0;
5721                        }
5722
5723                        if (ret > 0 && skinny_metadata) {
5724                                skinny_metadata = false;
5725                                key.objectid = bytenr;
5726                                key.type = BTRFS_EXTENT_ITEM_KEY;
5727                                key.offset = num_bytes;
5728                                btrfs_release_path(path);
5729                                ret = btrfs_search_slot(trans, extent_root,
5730                                                        &key, path, -1, 1);
5731                        }
5732
5733                        if (ret) {
5734                                btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5735                                        ret, bytenr);
5736                                if (ret > 0)
5737                                        btrfs_print_leaf(extent_root,
5738                                                         path->nodes[0]);
5739                        }
5740                        if (ret < 0) {
5741                                btrfs_abort_transaction(trans, extent_root, ret);
5742                                goto out;
5743                        }
5744                        extent_slot = path->slots[0];
5745                }
5746        } else if (WARN_ON(ret == -ENOENT)) {
5747                btrfs_print_leaf(extent_root, path->nodes[0]);
5748                btrfs_err(info,
5749                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5750                        bytenr, parent, root_objectid, owner_objectid,
5751                        owner_offset);
5752                btrfs_abort_transaction(trans, extent_root, ret);
5753                goto out;
5754        } else {
5755                btrfs_abort_transaction(trans, extent_root, ret);
5756                goto out;
5757        }
5758
5759        leaf = path->nodes[0];
5760        item_size = btrfs_item_size_nr(leaf, extent_slot);
5761#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5762        if (item_size < sizeof(*ei)) {
5763                BUG_ON(found_extent || extent_slot != path->slots[0]);
5764                ret = convert_extent_item_v0(trans, extent_root, path,
5765                                             owner_objectid, 0);
5766                if (ret < 0) {
5767                        btrfs_abort_transaction(trans, extent_root, ret);
5768                        goto out;
5769                }
5770
5771                btrfs_release_path(path);
5772                path->leave_spinning = 1;
5773
5774                key.objectid = bytenr;
5775                key.type = BTRFS_EXTENT_ITEM_KEY;
5776                key.offset = num_bytes;
5777
5778                ret = btrfs_search_slot(trans, extent_root, &key, path,
5779                                        -1, 1);
5780                if (ret) {
5781                        btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5782                                ret, bytenr);
5783                        btrfs_print_leaf(extent_root, path->nodes[0]);
5784                }
5785                if (ret < 0) {
5786                        btrfs_abort_transaction(trans, extent_root, ret);
5787                        goto out;
5788                }
5789
5790                extent_slot = path->slots[0];
5791                leaf = path->nodes[0];
5792                item_size = btrfs_item_size_nr(leaf, extent_slot);
5793        }
5794#endif
5795        BUG_ON(item_size < sizeof(*ei));
5796        ei = btrfs_item_ptr(leaf, extent_slot,
5797                            struct btrfs_extent_item);
5798        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5799            key.type == BTRFS_EXTENT_ITEM_KEY) {
5800                struct btrfs_tree_block_info *bi;
5801                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5802                bi = (struct btrfs_tree_block_info *)(ei + 1);
5803                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5804        }
5805
5806        refs = btrfs_extent_refs(leaf, ei);
5807        if (refs < refs_to_drop) {
5808                btrfs_err(info, "trying to drop %d refs but we only have %Lu "
5809                          "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
5810                ret = -EINVAL;
5811                btrfs_abort_transaction(trans, extent_root, ret);
5812                goto out;
5813        }
5814        refs -= refs_to_drop;
5815
5816        if (refs > 0) {
5817                if (extent_op)
5818                        __run_delayed_extent_op(extent_op, leaf, ei);
5819                /*
5820                 * In the case of inline back ref, reference count will
5821                 * be updated by remove_extent_backref
5822                 */
5823                if (iref) {
5824                        BUG_ON(!found_extent);
5825                } else {
5826                        btrfs_set_extent_refs(leaf, ei, refs);
5827                        btrfs_mark_buffer_dirty(leaf);
5828                }
5829                if (found_extent) {
5830                        ret = remove_extent_backref(trans, extent_root, path,
5831                                                    iref, refs_to_drop,
5832                                                    is_data);
5833                        if (ret) {
5834                                btrfs_abort_transaction(trans, extent_root, ret);
5835                                goto out;
5836                        }
5837                }
5838                add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5839                                 root_objectid);
5840        } else {
5841                if (found_extent) {
5842                        BUG_ON(is_data && refs_to_drop !=
5843                               extent_data_ref_count(root, path, iref));
5844                        if (iref) {
5845                                BUG_ON(path->slots[0] != extent_slot);
5846                        } else {
5847                                BUG_ON(path->slots[0] != extent_slot + 1);
5848                                path->slots[0] = extent_slot;
5849                                num_to_del = 2;
5850                        }
5851                }
5852
5853                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5854                                      num_to_del);
5855                if (ret) {
5856                        btrfs_abort_transaction(trans, extent_root, ret);
5857                        goto out;
5858                }
5859                btrfs_release_path(path);
5860
5861                if (is_data) {
5862                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5863                        if (ret) {
5864                                btrfs_abort_transaction(trans, extent_root, ret);
5865                                goto out;
5866                        }
5867                }
5868
5869                ret = update_block_group(root, bytenr, num_bytes, 0);
5870                if (ret) {
5871                        btrfs_abort_transaction(trans, extent_root, ret);
5872                        goto out;
5873                }
5874        }
5875out:
5876        btrfs_free_path(path);
5877        return ret;
5878}
5879
5880/*
5881 * when we free a block, it is possible (and likely) that we free the last
5882 * delayed ref for that extent as well.  This searches the delayed ref tree for
5883 * a given extent, and if there are no other delayed refs to be processed, it
5884 * removes it from the tree.
5885 */
5886static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5887                                      struct btrfs_root *root, u64 bytenr)
5888{
5889        struct btrfs_delayed_ref_head *head;
5890        struct btrfs_delayed_ref_root *delayed_refs;
5891        int ret = 0;
5892
5893        delayed_refs = &trans->transaction->delayed_refs;
5894        spin_lock(&delayed_refs->lock);
5895        head = btrfs_find_delayed_ref_head(trans, bytenr);
5896        if (!head)
5897                goto out_delayed_unlock;
5898
5899        spin_lock(&head->lock);
5900        if (rb_first(&head->ref_root))
5901                goto out;
5902
5903        if (head->extent_op) {
5904                if (!head->must_insert_reserved)
5905                        goto out;
5906                btrfs_free_delayed_extent_op(head->extent_op);
5907                head->extent_op = NULL;
5908        }
5909
5910        /*
5911         * waiting for the lock here would deadlock.  If someone else has it
5912         * locked, they are already in the process of dropping it anyway
5913         */
5914        if (!mutex_trylock(&head->mutex))
5915                goto out;
5916
5917        /*
5918         * at this point we have a head with no other entries.  Go
5919         * ahead and process it.
5920         */
5921        head->node.in_tree = 0;
5922        rb_erase(&head->href_node, &delayed_refs->href_root);
5923
5924        atomic_dec(&delayed_refs->num_entries);
5925
5926        /*
5927         * we don't take a ref on the node because we're removing it from the
5928         * tree, so we just steal the ref the tree was holding.
5929         */
5930        delayed_refs->num_heads--;
5931        if (head->processing == 0)
5932                delayed_refs->num_heads_ready--;
5933        head->processing = 0;
5934        spin_unlock(&head->lock);
5935        spin_unlock(&delayed_refs->lock);
5936
5937        BUG_ON(head->extent_op);
5938        if (head->must_insert_reserved)
5939                ret = 1;
5940
5941        mutex_unlock(&head->mutex);
5942        btrfs_put_delayed_ref(&head->node);
5943        return ret;
5944out:
5945        spin_unlock(&head->lock);
5946
5947out_delayed_unlock:
5948        spin_unlock(&delayed_refs->lock);
5949        return 0;
5950}
5951
5952void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5953                           struct btrfs_root *root,
5954                           struct extent_buffer *buf,
5955                           u64 parent, int last_ref)
5956{
5957        struct btrfs_block_group_cache *cache = NULL;
5958        int pin = 1;
5959        int ret;
5960
5961        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5962                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5963                                        buf->start, buf->len,
5964                                        parent, root->root_key.objectid,
5965                                        btrfs_header_level(buf),
5966                                        BTRFS_DROP_DELAYED_REF, NULL, 0);
5967                BUG_ON(ret); /* -ENOMEM */
5968        }
5969
5970        if (!last_ref)
5971                return;
5972
5973        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5974
5975        if (btrfs_header_generation(buf) == trans->transid) {
5976                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5977                        ret = check_ref_cleanup(trans, root, buf->start);
5978                        if (!ret)
5979                                goto out;
5980                }
5981
5982                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5983                        pin_down_extent(root, cache, buf->start, buf->len, 1);
5984                        goto out;
5985                }
5986
5987                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5988
5989                btrfs_add_free_space(cache, buf->start, buf->len);
5990                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5991                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
5992                pin = 0;
5993        }
5994out:
5995        if (pin)
5996                add_pinned_bytes(root->fs_info, buf->len,
5997                                 btrfs_header_level(buf),
5998                                 root->root_key.objectid);
5999
6000        /*
6001         * We're deleting the buffer, so clear the corrupt flag since it doesn't
6002         * matter anymore.
6003         */
6004        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6005        btrfs_put_block_group(cache);
6006}
6007
6008/* Can return -ENOMEM */
6009int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6010                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6011                      u64 owner, u64 offset, int for_cow)
6012{
6013        int ret;
6014        struct btrfs_fs_info *fs_info = root->fs_info;
6015
6016        add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6017
6018        /*
6019         * tree log blocks never actually go into the extent allocation
6020         * tree; just update the pinning info and exit early.
6021         */
6022        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6023                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6024                /* unlocks the pinned mutex */
6025                btrfs_pin_extent(root, bytenr, num_bytes, 1);
6026                ret = 0;
6027        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6028                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6029                                        num_bytes,
6030                                        parent, root_objectid, (int)owner,
6031                                        BTRFS_DROP_DELAYED_REF, NULL, for_cow);
6032        } else {
6033                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6034                                                num_bytes,
6035                                                parent, root_objectid, owner,
6036                                                offset, BTRFS_DROP_DELAYED_REF,
6037                                                NULL, for_cow);
6038        }
6039        return ret;
6040}
6041
6042static u64 stripe_align(struct btrfs_root *root,
6043                        struct btrfs_block_group_cache *cache,
6044                        u64 val, u64 num_bytes)
6045{
6046        u64 ret = ALIGN(val, root->stripesize);
6047        return ret;
6048}
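/*
 * stripe_align() just rounds up to the stripe size.  For example (numbers
 * are illustrative), with root->stripesize == 4096, a value of 12345 is
 * aligned up to 16384, while an already-aligned value such as 8192 is
 * returned unchanged.
 */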
6049
6050/*
6051 * when we wait for progress in the block group caching, it's because
6052 * our allocation attempt failed at least once.  So, we must sleep
6053 * and let some progress happen before we try again.
6054 *
6055 * This function will sleep at least once waiting for new free space to
6056 * show up, and then it will check the block group free space numbers
6057 * for our min num_bytes.  Another option is to have it go ahead
6058 * and look in the rbtree for a free extent of a given size, but this
6059 * is a good start.
6060 *
6061 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6062 * any of the information in this block group.
6063 */
6064static noinline void
6065wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6066                                u64 num_bytes)
6067{
6068        struct btrfs_caching_control *caching_ctl;
6069
6070        caching_ctl = get_caching_control(cache);
6071        if (!caching_ctl)
6072                return;
6073
6074        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6075                   (cache->free_space_ctl->free_space >= num_bytes));
6076
6077        put_caching_control(caching_ctl);
6078}
6079
6080static noinline int
6081wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6082{
6083        struct btrfs_caching_control *caching_ctl;
6084        int ret = 0;
6085
6086        caching_ctl = get_caching_control(cache);
6087        if (!caching_ctl)
6088                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6089
6090        wait_event(caching_ctl->wait, block_group_cache_done(cache));
6091        if (cache->cached == BTRFS_CACHE_ERROR)
6092                ret = -EIO;
6093        put_caching_control(caching_ctl);
6094        return ret;
6095}
6096
6097int __get_raid_index(u64 flags)
6098{
6099        if (flags & BTRFS_BLOCK_GROUP_RAID10)
6100                return BTRFS_RAID_RAID10;
6101        else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6102                return BTRFS_RAID_RAID1;
6103        else if (flags & BTRFS_BLOCK_GROUP_DUP)
6104                return BTRFS_RAID_DUP;
6105        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6106                return BTRFS_RAID_RAID0;
6107        else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6108                return BTRFS_RAID_RAID5;
6109        else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6110                return BTRFS_RAID_RAID6;
6111
6112        return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6113}
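/*
 * The chain of checks above picks the most specific raid profile bit that
 * is set and falls back to "single".  For example (the flag combinations
 * are illustrative):
 *
 *	__get_raid_index(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
 *		== BTRFS_RAID_RAID1
 *	__get_raid_index(BTRFS_BLOCK_GROUP_METADATA)
 *		== BTRFS_RAID_SINGLE
 */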
6114
6115int get_block_group_index(struct btrfs_block_group_cache *cache)
6116{
6117        return __get_raid_index(cache->flags);
6118}
6119
6120static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6121        [BTRFS_RAID_RAID10]     = "raid10",
6122        [BTRFS_RAID_RAID1]      = "raid1",
6123        [BTRFS_RAID_DUP]        = "dup",
6124        [BTRFS_RAID_RAID0]      = "raid0",
6125        [BTRFS_RAID_SINGLE]     = "single",
6126        [BTRFS_RAID_RAID5]      = "raid5",
6127        [BTRFS_RAID_RAID6]      = "raid6",
6128};
6129
6130static const char *get_raid_name(enum btrfs_raid_types type)
6131{
6132        if (type >= BTRFS_NR_RAID_TYPES)
6133                return NULL;
6134
6135        return btrfs_raid_type_names[type];
6136}
6137
6138enum btrfs_loop_type {
6139        LOOP_CACHING_NOWAIT = 0,
6140        LOOP_CACHING_WAIT = 1,
6141        LOOP_ALLOC_CHUNK = 2,
6142        LOOP_NO_EMPTY_SIZE = 3,
6143};
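/*
 * find_free_extent() below walks these stages in order: it starts at
 * LOOP_CACHING_NOWAIT and, each time a full pass over every raid index
 * comes up empty, bumps the loop value, so a stubborn allocation ends up
 * forcing a chunk allocation at LOOP_ALLOC_CHUNK and finally retrying with
 * empty_size and empty_cluster set to 0 at LOOP_NO_EMPTY_SIZE before
 * giving up with -ENOSPC.
 */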
6144
6145/*
6146 * walks the btree of allocated extents and finds a hole of a given size.
6147 * The key ins is changed to record the hole:
6148 * ins->objectid == start position
6149 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6150 * ins->offset == the size of the hole.
6151 * Any available blocks before search_start are skipped.
6152 *
6153 * If there is no suitable free space, we will record the max size of
6154 * the free space extent currently.
6155 */
6156static noinline int find_free_extent(struct btrfs_root *orig_root,
6157                                     u64 num_bytes, u64 empty_size,
6158                                     u64 hint_byte, struct btrfs_key *ins,
6159                                     u64 flags)
6160{
6161        int ret = 0;
6162        struct btrfs_root *root = orig_root->fs_info->extent_root;
6163        struct btrfs_free_cluster *last_ptr = NULL;
6164        struct btrfs_block_group_cache *block_group = NULL;
6165        u64 search_start = 0;
6166        u64 max_extent_size = 0;
6167        int empty_cluster = 2 * 1024 * 1024;
6168        struct btrfs_space_info *space_info;
6169        int loop = 0;
6170        int index = __get_raid_index(flags);
6171        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6172                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6173        bool failed_cluster_refill = false;
6174        bool failed_alloc = false;
6175        bool use_cluster = true;
6176        bool have_caching_bg = false;
6177
6178        WARN_ON(num_bytes < root->sectorsize);
6179        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
6180        ins->objectid = 0;
6181        ins->offset = 0;
6182
6183        trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6184
6185        space_info = __find_space_info(root->fs_info, flags);
6186        if (!space_info) {
6187                btrfs_err(root->fs_info, "No space info for %llu", flags);
6188                return -ENOSPC;
6189        }
6190
6191        /*
6192         * If the space info is for both data and metadata it means we have a
6193         * small filesystem and we can't use the clustering stuff.
6194         */
6195        if (btrfs_mixed_space_info(space_info))
6196                use_cluster = false;
6197
6198        if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6199                last_ptr = &root->fs_info->meta_alloc_cluster;
6200                if (!btrfs_test_opt(root, SSD))
6201                        empty_cluster = 64 * 1024;
6202        }
6203
6204        if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6205            btrfs_test_opt(root, SSD)) {
6206                last_ptr = &root->fs_info->data_alloc_cluster;
6207        }
6208
6209        if (last_ptr) {
6210                spin_lock(&last_ptr->lock);
6211                if (last_ptr->block_group)
6212                        hint_byte = last_ptr->window_start;
6213                spin_unlock(&last_ptr->lock);
6214        }
6215
6216        search_start = max(search_start, first_logical_byte(root, 0));
6217        search_start = max(search_start, hint_byte);
6218
6219        if (!last_ptr)
6220                empty_cluster = 0;
6221
6222        if (search_start == hint_byte) {
6223                block_group = btrfs_lookup_block_group(root->fs_info,
6224                                                       search_start);
6225                /*
6226                 * we don't want to use the block group if it doesn't match our
6227                 * allocation bits, or if it's not cached.
6228                 *
6229                 * However if we are re-searching with an ideal block group
6230                 * picked out then we don't care that the block group is cached.
6231                 */
6232                if (block_group && block_group_bits(block_group, flags) &&
6233                    block_group->cached != BTRFS_CACHE_NO) {
6234                        down_read(&space_info->groups_sem);
6235                        if (list_empty(&block_group->list) ||
6236                            block_group->ro) {
6237                                /*
6238                                 * someone is removing this block group,
6239                                 * we can't jump into the have_block_group
6240                                 * target because our list pointers are not
6241                                 * valid
6242                                 */
6243                                btrfs_put_block_group(block_group);
6244                                up_read(&space_info->groups_sem);
6245                        } else {
6246                                index = get_block_group_index(block_group);
6247                                goto have_block_group;
6248                        }
6249                } else if (block_group) {
6250                        btrfs_put_block_group(block_group);
6251                }
6252        }
6253search:
6254        have_caching_bg = false;
6255        down_read(&space_info->groups_sem);
6256        list_for_each_entry(block_group, &space_info->block_groups[index],
6257                            list) {
6258                u64 offset;
6259                int cached;
6260
6261                btrfs_get_block_group(block_group);
6262                search_start = block_group->key.objectid;
6263
6264                /*
6265                 * this can happen if we end up cycling through all the
6266                 * raid types, but we want to make sure we only allocate
6267                 * for the proper type.
6268                 */
6269                if (!block_group_bits(block_group, flags)) {
6270                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
6271                                    BTRFS_BLOCK_GROUP_RAID1 |
6272                                    BTRFS_BLOCK_GROUP_RAID5 |
6273                                    BTRFS_BLOCK_GROUP_RAID6 |
6274                                    BTRFS_BLOCK_GROUP_RAID10;
6275
6276                        /*
6277                         * if they asked for extra copies and this block group
6278                         * doesn't provide them, bail.  This does allow us to
6279                         * fill raid0 from raid1.
6280                         */
6281                        if ((flags & extra) && !(block_group->flags & extra))
6282                                goto loop;
6283                }
6284
6285have_block_group:
6286                cached = block_group_cache_done(block_group);
6287                if (unlikely(!cached)) {
6288                        ret = cache_block_group(block_group, 0);
6289                        BUG_ON(ret < 0);
6290                        ret = 0;
6291                }
6292
6293                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6294                        goto loop;
6295                if (unlikely(block_group->ro))
6296                        goto loop;
6297
6298                /*
6299                 * OK, we want to try to use the cluster allocator, so
6300                 * let's look there
6301                 */
6302                if (last_ptr) {
6303                        struct btrfs_block_group_cache *used_block_group;
6304                        unsigned long aligned_cluster;
6305                        /*
6306                         * the refill lock keeps out other
6307                         * people trying to start a new cluster
6308                         */
6309                        spin_lock(&last_ptr->refill_lock);
6310                        used_block_group = last_ptr->block_group;
6311                        if (used_block_group != block_group &&
6312                            (!used_block_group ||
6313                             used_block_group->ro ||
6314                             !block_group_bits(used_block_group, flags)))
6315                                goto refill_cluster;
6316
6317                        if (used_block_group != block_group)
6318                                btrfs_get_block_group(used_block_group);
6319
6320                        offset = btrfs_alloc_from_cluster(used_block_group,
6321                                                last_ptr,
6322                                                num_bytes,
6323                                                used_block_group->key.objectid,
6324                                                &max_extent_size);
6325                        if (offset) {
6326                                /* we have a block, we're done */
6327                                spin_unlock(&last_ptr->refill_lock);
6328                                trace_btrfs_reserve_extent_cluster(root,
6329                                                used_block_group,
6330                                                search_start, num_bytes);
6331                                if (used_block_group != block_group) {
6332                                        btrfs_put_block_group(block_group);
6333                                        block_group = used_block_group;
6334                                }
6335                                goto checks;
6336                        }
6337
6338                        WARN_ON(last_ptr->block_group != used_block_group);
6339                        if (used_block_group != block_group)
6340                                btrfs_put_block_group(used_block_group);
6341refill_cluster:
6342                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6343                         * set up a new cluster, so let's just skip it
6344                         * and let the allocator find whatever block
6345                         * it can find.  If we reach this point, we
6346                         * will have tried the cluster allocator
6347                         * plenty of times and not have found
6348                         * anything, so we are likely way too
6349                         * fragmented for the clustering stuff to find
6350                         * anything.
6351                         *
6352                         * However, if the cluster is taken from the
6353                         * current block group, release the cluster
6354                         * first, so that we stand a better chance of
6355                         * succeeding in the unclustered
6356                         * allocation.  */
6357                        if (loop >= LOOP_NO_EMPTY_SIZE &&
6358                            last_ptr->block_group != block_group) {
6359                                spin_unlock(&last_ptr->refill_lock);
6360                                goto unclustered_alloc;
6361                        }
6362
6363                        /*
6364                         * this cluster didn't work out, free it and
6365                         * start over
6366                         */
6367                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6368
6369                        if (loop >= LOOP_NO_EMPTY_SIZE) {
6370                                spin_unlock(&last_ptr->refill_lock);
6371                                goto unclustered_alloc;
6372                        }
6373
6374                        aligned_cluster = max_t(unsigned long,
6375                                                empty_cluster + empty_size,
6376                                              block_group->full_stripe_len);
6377
6378                        /* allocate a cluster in this block group */
6379                        ret = btrfs_find_space_cluster(root, block_group,
6380                                                       last_ptr, search_start,
6381                                                       num_bytes,
6382                                                       aligned_cluster);
6383                        if (ret == 0) {
6384                                /*
6385                                 * now pull our allocation out of this
6386                                 * cluster
6387                                 */
6388                                offset = btrfs_alloc_from_cluster(block_group,
6389                                                        last_ptr,
6390                                                        num_bytes,
6391                                                        search_start,
6392                                                        &max_extent_size);
6393                                if (offset) {
6394                                        /* we found one, proceed */
6395                                        spin_unlock(&last_ptr->refill_lock);
6396                                        trace_btrfs_reserve_extent_cluster(root,
6397                                                block_group, search_start,
6398                                                num_bytes);
6399                                        goto checks;
6400                                }
6401                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
6402                                   && !failed_cluster_refill) {
6403                                spin_unlock(&last_ptr->refill_lock);
6404
6405                                failed_cluster_refill = true;
6406                                wait_block_group_cache_progress(block_group,
6407                                       num_bytes + empty_cluster + empty_size);
6408                                goto have_block_group;
6409                        }
6410
6411                        /*
6412                         * at this point we either didn't find a cluster
6413                         * or we weren't able to allocate a block from our
6414                         * cluster.  Free the cluster we've been trying
6415                         * to use, and go to the next block group
6416                         */
6417                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6418                        spin_unlock(&last_ptr->refill_lock);
6419                        goto loop;
6420                }
6421
6422unclustered_alloc:
6423                spin_lock(&block_group->free_space_ctl->tree_lock);
6424                if (cached &&
6425                    block_group->free_space_ctl->free_space <
6426                    num_bytes + empty_cluster + empty_size) {
6427                        if (block_group->free_space_ctl->free_space >
6428                            max_extent_size)
6429                                max_extent_size =
6430                                        block_group->free_space_ctl->free_space;
6431                        spin_unlock(&block_group->free_space_ctl->tree_lock);
6432                        goto loop;
6433                }
6434                spin_unlock(&block_group->free_space_ctl->tree_lock);
6435
6436                offset = btrfs_find_space_for_alloc(block_group, search_start,
6437                                                    num_bytes, empty_size,
6438                                                    &max_extent_size);
6439                /*
6440                 * If we didn't find a chunk, and we haven't failed on this
6441                 * block group before, and this block group is in the middle of
6442                 * caching and we are ok with waiting, then go ahead and wait
6443                 * for progress to be made, and set failed_alloc to true.
6444                 *
6445                 * If failed_alloc is true then we've already waited on this
6446                 * block group once and should move on to the next block group.
6447                 */
6448                if (!offset && !failed_alloc && !cached &&
6449                    loop > LOOP_CACHING_NOWAIT) {
6450                        wait_block_group_cache_progress(block_group,
6451                                                num_bytes + empty_size);
6452                        failed_alloc = true;
6453                        goto have_block_group;
6454                } else if (!offset) {
6455                        if (!cached)
6456                                have_caching_bg = true;
6457                        goto loop;
6458                }
6459checks:
6460                search_start = stripe_align(root, block_group,
6461                                            offset, num_bytes);
6462
6463                /* move on to the next group */
6464                if (search_start + num_bytes >
6465                    block_group->key.objectid + block_group->key.offset) {
6466                        btrfs_add_free_space(block_group, offset, num_bytes);
6467                        goto loop;
6468                }
6469
6470                if (offset < search_start)
6471                        btrfs_add_free_space(block_group, offset,
6472                                             search_start - offset);
6473                BUG_ON(offset > search_start);
6474
6475                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6476                                                  alloc_type);
6477                if (ret == -EAGAIN) {
6478                        btrfs_add_free_space(block_group, offset, num_bytes);
6479                        goto loop;
6480                }
6481
6482                /* we are all good, let's return */
6483                ins->objectid = search_start;
6484                ins->offset = num_bytes;
6485
6486                trace_btrfs_reserve_extent(orig_root, block_group,
6487                                           search_start, num_bytes);
6488                btrfs_put_block_group(block_group);
6489                break;
6490loop:
6491                failed_cluster_refill = false;
6492                failed_alloc = false;
6493                BUG_ON(index != get_block_group_index(block_group));
6494                btrfs_put_block_group(block_group);
6495        }
6496        up_read(&space_info->groups_sem);
6497
6498        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6499                goto search;
6500
6501        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6502                goto search;
6503
6504        /*
6505         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6506         *                      caching kthreads as we move along
6507         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6508         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6509         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6510         *                      again
6511         */
6512        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6513                index = 0;
6514                loop++;
6515                if (loop == LOOP_ALLOC_CHUNK) {
6516                        struct btrfs_trans_handle *trans;
6517
6518                        trans = btrfs_join_transaction(root);
6519                        if (IS_ERR(trans)) {
6520                                ret = PTR_ERR(trans);
6521                                goto out;
6522                        }
6523
6524                        ret = do_chunk_alloc(trans, root, flags,
6525                                             CHUNK_ALLOC_FORCE);
6526                        /*
6527                         * Do not bail out on ENOSPC since we
6528                         * can do more things.
6529                         */
6530                        if (ret < 0 && ret != -ENOSPC)
6531                                btrfs_abort_transaction(trans,
6532                                                        root, ret);
6533                        else
6534                                ret = 0;
6535                        btrfs_end_transaction(trans, root);
6536                        if (ret)
6537                                goto out;
6538                }
6539
6540                if (loop == LOOP_NO_EMPTY_SIZE) {
6541                        empty_size = 0;
6542                        empty_cluster = 0;
6543                }
6544
6545                goto search;
6546        } else if (!ins->objectid) {
6547                ret = -ENOSPC;
6548        } else if (ins->objectid) {
6549                ret = 0;
6550        }
6551out:
6552        if (ret == -ENOSPC)
6553                ins->offset = max_extent_size;
6554        return ret;
6555}
6556
6557static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6558                            int dump_block_groups)
6559{
6560        struct btrfs_block_group_cache *cache;
6561        int index = 0;
6562
6563        spin_lock(&info->lock);
6564        printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
6565               info->flags,
6566               info->total_bytes - info->bytes_used - info->bytes_pinned -
6567               info->bytes_reserved - info->bytes_readonly,
6568               (info->full) ? "" : "not ");
6569        printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
6570               "reserved=%llu, may_use=%llu, readonly=%llu\n",
6571               info->total_bytes, info->bytes_used, info->bytes_pinned,
6572               info->bytes_reserved, info->bytes_may_use,
6573               info->bytes_readonly);
6574        spin_unlock(&info->lock);
6575
6576        if (!dump_block_groups)
6577                return;
6578
6579        down_read(&info->groups_sem);
6580again:
6581        list_for_each_entry(cache, &info->block_groups[index], list) {
6582                spin_lock(&cache->lock);
6583                printk(KERN_INFO "BTRFS: "
6584                           "block group %llu has %llu bytes, "
6585                           "%llu used %llu pinned %llu reserved %s\n",
6586                       cache->key.objectid, cache->key.offset,
6587                       btrfs_block_group_used(&cache->item), cache->pinned,
6588                       cache->reserved, cache->ro ? "[readonly]" : "");
6589                btrfs_dump_free_space(cache, bytes);
6590                spin_unlock(&cache->lock);
6591        }
6592        if (++index < BTRFS_NR_RAID_TYPES)
6593                goto again;
6594        up_read(&info->groups_sem);
6595}
6596
6597int btrfs_reserve_extent(struct btrfs_root *root,
6598                         u64 num_bytes, u64 min_alloc_size,
6599                         u64 empty_size, u64 hint_byte,
6600                         struct btrfs_key *ins, int is_data)
6601{
6602        bool final_tried = false;
6603        u64 flags;
6604        int ret;
6605
6606        flags = btrfs_get_alloc_profile(root, is_data);
6607again:
6608        WARN_ON(num_bytes < root->sectorsize);
6609        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
6610                               flags);
6611
6612        if (ret == -ENOSPC) {
6613                if (!final_tried && ins->offset) {
6614                        num_bytes = min(num_bytes >> 1, ins->offset);
6615                        num_bytes = round_down(num_bytes, root->sectorsize);
6616                        num_bytes = max(num_bytes, min_alloc_size);
6617                        if (num_bytes == min_alloc_size)
6618                                final_tried = true;
6619                        goto again;
6620                } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6621                        struct btrfs_space_info *sinfo;
6622
6623                        sinfo = __find_space_info(root->fs_info, flags);
6624                        btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6625                                flags, num_bytes);
6626                        if (sinfo)
6627                                dump_space_info(sinfo, num_bytes, 1);
6628                }
6629        }
6630
6631        return ret;
6632}
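/*
 * A worked example of the retry above (illustrative numbers, sectorsize
 * 4096, min_alloc_size 64 KiB): a first attempt for 8 MiB fails with
 * -ENOSPC and ins->offset reporting a 3 MiB largest free extent, so the
 * retry asks for min(8 MiB >> 1, 3 MiB) == 3 MiB, which is already sector
 * aligned and above min_alloc_size.  Only once num_bytes has shrunk all
 * the way down to min_alloc_size is final_tried set, and the next failure
 * is returned to the caller.
 */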
6633
6634static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6635                                        u64 start, u64 len, int pin)
6636{
6637        struct btrfs_block_group_cache *cache;
6638        int ret = 0;
6639
6640        cache = btrfs_lookup_block_group(root->fs_info, start);
6641        if (!cache) {
6642                btrfs_err(root->fs_info, "Unable to find block group for %llu",
6643                        start);
6644                return -ENOSPC;
6645        }
6646
6647        if (btrfs_test_opt(root, DISCARD))
6648                ret = btrfs_discard_extent(root, start, len, NULL);
6649
6650        if (pin)
6651                pin_down_extent(root, cache, start, len, 1);
6652        else {
6653                btrfs_add_free_space(cache, start, len);
6654                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6655        }
6656        btrfs_put_block_group(cache);
6657
6658        trace_btrfs_reserved_extent_free(root, start, len);
6659
6660        return ret;
6661}
6662
6663int btrfs_free_reserved_extent(struct btrfs_root *root,
6664                                        u64 start, u64 len)
6665{
6666        return __btrfs_free_reserved_extent(root, start, len, 0);
6667}
6668
6669int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6670                                       u64 start, u64 len)
6671{
6672        return __btrfs_free_reserved_extent(root, start, len, 1);
6673}
6674
6675static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6676                                      struct btrfs_root *root,
6677                                      u64 parent, u64 root_objectid,
6678                                      u64 flags, u64 owner, u64 offset,
6679                                      struct btrfs_key *ins, int ref_mod)
6680{
6681        int ret;
6682        struct btrfs_fs_info *fs_info = root->fs_info;
6683        struct btrfs_extent_item *extent_item;
6684        struct btrfs_extent_inline_ref *iref;
6685        struct btrfs_path *path;
6686        struct extent_buffer *leaf;
6687        int type;
6688        u32 size;
6689
6690        if (parent > 0)
6691                type = BTRFS_SHARED_DATA_REF_KEY;
6692        else
6693                type = BTRFS_EXTENT_DATA_REF_KEY;
6694
6695        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6696
6697        path = btrfs_alloc_path();
6698        if (!path)
6699                return -ENOMEM;
6700
6701        path->leave_spinning = 1;
6702        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6703                                      ins, size);
6704        if (ret) {
6705                btrfs_free_path(path);
6706                return ret;
6707        }
6708
6709        leaf = path->nodes[0];
6710        extent_item = btrfs_item_ptr(leaf, path->slots[0],
6711                                     struct btrfs_extent_item);
6712        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6713        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6714        btrfs_set_extent_flags(leaf, extent_item,
6715                               flags | BTRFS_EXTENT_FLAG_DATA);
6716
6717        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6718        btrfs_set_extent_inline_ref_type(leaf, iref, type);
6719        if (parent > 0) {
6720                struct btrfs_shared_data_ref *ref;
6721                ref = (struct btrfs_shared_data_ref *)(iref + 1);
6722                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6723                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6724        } else {
6725                struct btrfs_extent_data_ref *ref;
6726                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6727                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6728                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6729                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6730                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6731        }
6732
6733        btrfs_mark_buffer_dirty(path->nodes[0]);
6734        btrfs_free_path(path);
6735
6736        ret = update_block_group(root, ins->objectid, ins->offset, 1);
6737        if (ret) { /* -ENOENT, logic error */
6738                btrfs_err(fs_info, "update block group failed for %llu %llu",
6739                        ins->objectid, ins->offset);
6740                BUG();
6741        }
6742        trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6743        return ret;
6744}
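/*
 * A rough sketch of the item that alloc_reserved_file_extent() lays out
 * above, shown only for orientation (use the accessor helpers, not raw
 * offsets).  For a non-shared ref (parent == 0):
 *
 *   size = sizeof(struct btrfs_extent_item) +
 *          btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)
 *
 *   btrfs_extent_item        { refs, generation, flags | FLAG_DATA }
 *   btrfs_extent_inline_ref  { type = BTRFS_EXTENT_DATA_REF_KEY }
 *   btrfs_extent_data_ref    { root, objectid, offset, count }
 *
 * With parent > 0 the inline ref is a BTRFS_SHARED_DATA_REF_KEY whose
 * offset is the parent bytenr, followed only by a ref count.
 */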
6745
6746static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6747                                     struct btrfs_root *root,
6748                                     u64 parent, u64 root_objectid,
6749                                     u64 flags, struct btrfs_disk_key *key,
6750                                     int level, struct btrfs_key *ins)
6751{
6752        int ret;
6753        struct btrfs_fs_info *fs_info = root->fs_info;
6754        struct btrfs_extent_item *extent_item;
6755        struct btrfs_tree_block_info *block_info;
6756        struct btrfs_extent_inline_ref *iref;
6757        struct btrfs_path *path;
6758        struct extent_buffer *leaf;
6759        u32 size = sizeof(*extent_item) + sizeof(*iref);
6760        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6761                                                 SKINNY_METADATA);
6762
6763        if (!skinny_metadata)
6764                size += sizeof(*block_info);
6765
6766        path = btrfs_alloc_path();
6767        if (!path) {
6768                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
6769                                                   root->leafsize);
6770                return -ENOMEM;
6771        }
6772
6773        path->leave_spinning = 1;
6774        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6775                                      ins, size);
6776        if (ret) {
6777                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
6778                                                   root->leafsize);
6779                btrfs_free_path(path);
6780                return ret;
6781        }
6782
6783        leaf = path->nodes[0];
6784        extent_item = btrfs_item_ptr(leaf, path->slots[0],
6785                                     struct btrfs_extent_item);
6786        btrfs_set_extent_refs(leaf, extent_item, 1);
6787        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6788        btrfs_set_extent_flags(leaf, extent_item,
6789                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6790
6791        if (skinny_metadata) {
6792                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6793        } else {
6794                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6795                btrfs_set_tree_block_key(leaf, block_info, key);
6796                btrfs_set_tree_block_level(leaf, block_info, level);
6797                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6798        }
6799
6800        if (parent > 0) {
6801                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6802                btrfs_set_extent_inline_ref_type(leaf, iref,
6803                                                 BTRFS_SHARED_BLOCK_REF_KEY);
6804                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6805        } else {
6806                btrfs_set_extent_inline_ref_type(leaf, iref,
6807                                                 BTRFS_TREE_BLOCK_REF_KEY);
6808                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6809        }
6810
6811        btrfs_mark_buffer_dirty(leaf);
6812        btrfs_free_path(path);
6813
6814        ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6815        if (ret) { /* -ENOENT, logic error */
6816                btrfs_err(fs_info, "update block group failed for %llu %llu",
6817                        ins->objectid, ins->offset);
6818                BUG();
6819        }
6820
6821        trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
6822        return ret;
6823}
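/*
 * For orientation, the two layouts alloc_reserved_tree_block() can write
 * above (purely illustrative, derived from the size computation):
 *
 *   skinny metadata:  extent_item + inline_ref
 *   classic:          extent_item + tree_block_info(key, level) + inline_ref
 *
 * In both cases the inline ref is BTRFS_SHARED_BLOCK_REF_KEY (offset =
 * parent) when the block carries a full backref, otherwise
 * BTRFS_TREE_BLOCK_REF_KEY (offset = root_objectid).
 */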
6824
6825int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6826                                     struct btrfs_root *root,
6827                                     u64 root_objectid, u64 owner,
6828                                     u64 offset, struct btrfs_key *ins)
6829{
6830        int ret;
6831
6832        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6833
6834        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6835                                         ins->offset, 0,
6836                                         root_objectid, owner, offset,
6837                                         BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6838        return ret;
6839}
6840
6841/*
6842 * this is used by the tree logging recovery code.  It records that
6843 * an extent has been allocated and makes sure to clear the free
6844 * space cache bits as well
6845 */
6846int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6847                                   struct btrfs_root *root,
6848                                   u64 root_objectid, u64 owner, u64 offset,
6849                                   struct btrfs_key *ins)
6850{
6851        int ret;
6852        struct btrfs_block_group_cache *block_group;
6853
6854        /*
6855         * Mixed block groups will exclude before processing the log so we only
6856         * need to do the exclude dance if this fs isn't mixed.
6857         */
6858        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6859                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6860                if (ret)
6861                        return ret;
6862        }
6863
6864        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6865        if (!block_group)
6866                return -EINVAL;
6867
6868        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6869                                          RESERVE_ALLOC_NO_ACCOUNT);
6870        BUG_ON(ret); /* logic error */
6871        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6872                                         0, owner, offset, ins, 1);
6873        btrfs_put_block_group(block_group);
6874        return ret;
6875}
6876
6877static struct extent_buffer *
6878btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6879                      u64 bytenr, u32 blocksize, int level)
6880{
6881        struct extent_buffer *buf;
6882
6883        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6884        if (!buf)
6885                return ERR_PTR(-ENOMEM);
6886        btrfs_set_header_generation(buf, trans->transid);
6887        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6888        btrfs_tree_lock(buf);
6889        clean_tree_block(trans, root, buf);
6890        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6891
6892        btrfs_set_lock_blocking(buf);
6893        btrfs_set_buffer_uptodate(buf);
6894
6895        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6896                /*
6897                 * we allow two log transactions at a time, use a different
6898                 * EXTENT bit to differentiate dirty pages.
6899                 */
6900                if (root->log_transid % 2 == 0)
6901                        set_extent_dirty(&root->dirty_log_pages, buf->start,
6902                                        buf->start + buf->len - 1, GFP_NOFS);
6903                else
6904                        set_extent_new(&root->dirty_log_pages, buf->start,
6905                                        buf->start + buf->len - 1, GFP_NOFS);
6906        } else {
6907                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6908                         buf->start + buf->len - 1, GFP_NOFS);
6909        }
6910        trans->blocks_used++;
6911        /* this returns a buffer locked for blocking */
6912        return buf;
6913}
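/*
 * Note on the log-tree branch above: two log transactions may be in
 * flight, so dirty ranges from each are tagged with different bits in
 * dirty_log_pages, e.g. (transids invented for illustration)
 *
 *   log_transid = 4 (even)  ->  set_extent_dirty()
 *   log_transid = 5 (odd)   ->  set_extent_new()
 *
 * while non-log trees simply dirty the range in the transaction's
 * dirty_pages tree.
 */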
6914
6915static struct btrfs_block_rsv *
6916use_block_rsv(struct btrfs_trans_handle *trans,
6917              struct btrfs_root *root, u32 blocksize)
6918{
6919        struct btrfs_block_rsv *block_rsv;
6920        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6921        int ret;
6922        bool global_updated = false;
6923
6924        block_rsv = get_block_rsv(trans, root);
6925
6926        if (unlikely(block_rsv->size == 0))
6927                goto try_reserve;
6928again:
6929        ret = block_rsv_use_bytes(block_rsv, blocksize);
6930        if (!ret)
6931                return block_rsv;
6932
6933        if (block_rsv->failfast)
6934                return ERR_PTR(ret);
6935
6936        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
6937                global_updated = true;
6938                update_global_block_rsv(root->fs_info);
6939                goto again;
6940        }
6941
6942        if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6943                static DEFINE_RATELIMIT_STATE(_rs,
6944                                DEFAULT_RATELIMIT_INTERVAL * 10,
6945                                /*DEFAULT_RATELIMIT_BURST*/ 1);
6946                if (__ratelimit(&_rs))
6947                        WARN(1, KERN_DEBUG
6948                                "BTRFS: block rsv returned %d\n", ret);
6949        }
6950try_reserve:
6951        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6952                                     BTRFS_RESERVE_NO_FLUSH);
6953        if (!ret)
6954                return block_rsv;
6955        /*
6956         * If we couldn't reserve metadata bytes, try to use some from
6957         * the global reserve if it shares the same space info as the
6958         * global reservation.
6959         */
6960        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
6961            block_rsv->space_info == global_rsv->space_info) {
6962                ret = block_rsv_use_bytes(global_rsv, blocksize);
6963                if (!ret)
6964                        return global_rsv;
6965        }
6966        return ERR_PTR(ret);
6967}
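/*
 * Condensed view of the fallback order implemented by use_block_rsv()
 * above (a sketch, not a substitute for the real error handling):
 *
 *   1. take blocksize bytes from the root's block_rsv (an empty rsv
 *      skips straight to step 3)
 *   2. if that fails on the global rsv, refresh it once and retry
 *   3. otherwise reserve_metadata_bytes(..., BTRFS_RESERVE_NO_FLUSH)
 *   4. as a last resort, steal from the global rsv when both rsvs share
 *      the same space_info
 *
 * A failfast rsv returns the error right after step 1.
 */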
6968
6969static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6970                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
6971{
6972        block_rsv_add_bytes(block_rsv, blocksize, 0);
6973        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6974}
6975
6976/*
6977 * finds a free extent and does all the dirty work required for allocation.
6978 * the key for the extent is set up in ins, and a tree buffer for
6979 * the first block of the extent is handed back to the caller.
6980 *
6981 * returns the tree buffer or an ERR_PTR on failure.
6982 */
6983struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6984                                        struct btrfs_root *root, u32 blocksize,
6985                                        u64 parent, u64 root_objectid,
6986                                        struct btrfs_disk_key *key, int level,
6987                                        u64 hint, u64 empty_size)
6988{
6989        struct btrfs_key ins;
6990        struct btrfs_block_rsv *block_rsv;
6991        struct extent_buffer *buf;
6992        u64 flags = 0;
6993        int ret;
6994        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6995                                                 SKINNY_METADATA);
6996
6997        block_rsv = use_block_rsv(trans, root, blocksize);
6998        if (IS_ERR(block_rsv))
6999                return ERR_CAST(block_rsv);
7000
7001        ret = btrfs_reserve_extent(root, blocksize, blocksize,
7002                                   empty_size, hint, &ins, 0);
7003        if (ret) {
7004                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7005                return ERR_PTR(ret);
7006        }
7007
7008        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
7009                                    blocksize, level);
7010        BUG_ON(IS_ERR(buf)); /* -ENOMEM */
7011
7012        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7013                if (parent == 0)
7014                        parent = ins.objectid;
7015                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7016        } else
7017                BUG_ON(parent > 0);
7018
7019        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7020                struct btrfs_delayed_extent_op *extent_op;
7021                extent_op = btrfs_alloc_delayed_extent_op();
7022                BUG_ON(!extent_op); /* -ENOMEM */
7023                if (key)
7024                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
7025                else
7026                        memset(&extent_op->key, 0, sizeof(extent_op->key));
7027                extent_op->flags_to_set = flags;
7028                if (skinny_metadata)
7029                        extent_op->update_key = 0;
7030                else
7031                        extent_op->update_key = 1;
7032                extent_op->update_flags = 1;
7033                extent_op->is_data = 0;
7034                extent_op->level = level;
7035
7036                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7037                                        ins.objectid,
7038                                        ins.offset, parent, root_objectid,
7039                                        level, BTRFS_ADD_DELAYED_EXTENT,
7040                                        extent_op, 0);
7041                BUG_ON(ret); /* -ENOMEM */
7042        }
7043        return buf;
7044}
7045
7046struct walk_control {
7047        u64 refs[BTRFS_MAX_LEVEL];
7048        u64 flags[BTRFS_MAX_LEVEL];
7049        struct btrfs_key update_progress;
7050        int stage;
7051        int level;
7052        int shared_level;
7053        int update_ref;
7054        int keep_locks;
7055        int reada_slot;
7056        int reada_count;
7057        int for_reloc;
7058};
7059
7060#define DROP_REFERENCE  1
7061#define UPDATE_BACKREF  2
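/*
 * Rough summary of how the two stages drive the walk implemented below
 * (the details live in walk_down_proc/do_walk_down/walk_up_proc):
 *
 *   DROP_REFERENCE: walk down dropping our reference on every block we
 *     own outright; a shared block either has one ref dropped in place
 *     or, if update_ref asks for it, flips the walk into UPDATE_BACKREF.
 *   UPDATE_BACKREF: re-walk the shared subtree converting implicit refs
 *     into full backrefs, then switch back to DROP_REFERENCE once the
 *     walk climbs back up to wc->shared_level.
 */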
7062
7063static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7064                                     struct btrfs_root *root,
7065                                     struct walk_control *wc,
7066                                     struct btrfs_path *path)
7067{
7068        u64 bytenr;
7069        u64 generation;
7070        u64 refs;
7071        u64 flags;
7072        u32 nritems;
7073        u32 blocksize;
7074        struct btrfs_key key;
7075        struct extent_buffer *eb;
7076        int ret;
7077        int slot;
7078        int nread = 0;
7079
7080        if (path->slots[wc->level] < wc->reada_slot) {
7081                wc->reada_count = wc->reada_count * 2 / 3;
7082                wc->reada_count = max(wc->reada_count, 2);
7083        } else {
7084                wc->reada_count = wc->reada_count * 3 / 2;
7085                wc->reada_count = min_t(int, wc->reada_count,
7086                                        BTRFS_NODEPTRS_PER_BLOCK(root));
7087        }
7088
7089        eb = path->nodes[wc->level];
7090        nritems = btrfs_header_nritems(eb);
7091        blocksize = btrfs_level_size(root, wc->level - 1);
7092
7093        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7094                if (nread >= wc->reada_count)
7095                        break;
7096
7097                cond_resched();
7098                bytenr = btrfs_node_blockptr(eb, slot);
7099                generation = btrfs_node_ptr_generation(eb, slot);
7100
7101                if (slot == path->slots[wc->level])
7102                        goto reada;
7103
7104                if (wc->stage == UPDATE_BACKREF &&
7105                    generation <= root->root_key.offset)
7106                        continue;
7107
7108                /* We don't lock the tree block, it's OK to be racy here */
7109                ret = btrfs_lookup_extent_info(trans, root, bytenr,
7110                                               wc->level - 1, 1, &refs,
7111                                               &flags);
7112                /* We don't care about errors in readahead. */
7113                if (ret < 0)
7114                        continue;
7115                BUG_ON(refs == 0);
7116
7117                if (wc->stage == DROP_REFERENCE) {
7118                        if (refs == 1)
7119                                goto reada;
7120
7121                        if (wc->level == 1 &&
7122                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7123                                continue;
7124                        if (!wc->update_ref ||
7125                            generation <= root->root_key.offset)
7126                                continue;
7127                        btrfs_node_key_to_cpu(eb, &key, slot);
7128                        ret = btrfs_comp_cpu_keys(&key,
7129                                                  &wc->update_progress);
7130                        if (ret < 0)
7131                                continue;
7132                } else {
7133                        if (wc->level == 1 &&
7134                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7135                                continue;
7136                }
7137reada:
7138                ret = readahead_tree_block(root, bytenr, blocksize,
7139                                           generation);
7140                if (ret)
7141                        break;
7142                nread++;
7143        }
7144        wc->reada_slot = slot;
7145}
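/*
 * The readahead window above adapts to how far the walker has advanced;
 * a small worked example with an invented starting value:
 *
 *   reada_count = 32, slot still behind reada_slot: 32 * 2 / 3 = 21
 *                                                   (never below 2)
 *   reada_count = 32, slot caught up:               32 * 3 / 2 = 48,
 *                     capped at BTRFS_NODEPTRS_PER_BLOCK(root)
 *
 * so the window shrinks while readahead is ahead of the walk and grows
 * again once the walk catches up.
 */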
7146
7147/*
7148 * helper to process tree block while walking down the tree.
7149 *
7150 * when wc->stage == UPDATE_BACKREF, this function updates
7151 * back refs for pointers in the block.
7152 *
7153 * NOTE: return value 1 means we should stop walking down.
7154 */
7155static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7156                                   struct btrfs_root *root,
7157                                   struct btrfs_path *path,
7158                                   struct walk_control *wc, int lookup_info)
7159{
7160        int level = wc->level;
7161        struct extent_buffer *eb = path->nodes[level];
7162        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7163        int ret;
7164
7165        if (wc->stage == UPDATE_BACKREF &&
7166            btrfs_header_owner(eb) != root->root_key.objectid)
7167                return 1;
7168
7169        /*
7170         * when the reference count of a tree block is 1, it won't increase
7171         * again. once the full backref flag is set, we never clear it.
7172         */
7173        if (lookup_info &&
7174            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7175             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
7176                BUG_ON(!path->locks[level]);
7177                ret = btrfs_lookup_extent_info(trans, root,
7178                                               eb->start, level, 1,
7179                                               &wc->refs[level],
7180                                               &wc->flags[level]);
7181                BUG_ON(ret == -ENOMEM);
7182                if (ret)
7183                        return ret;
7184                BUG_ON(wc->refs[level] == 0);
7185        }
7186
7187        if (wc->stage == DROP_REFERENCE) {
7188                if (wc->refs[level] > 1)
7189                        return 1;
7190
7191                if (path->locks[level] && !wc->keep_locks) {
7192                        btrfs_tree_unlock_rw(eb, path->locks[level]);
7193                        path->locks[level] = 0;
7194                }
7195                return 0;
7196        }
7197
7198        /* wc->stage == UPDATE_BACKREF */
7199        if (!(wc->flags[level] & flag)) {
7200                BUG_ON(!path->locks[level]);
7201                ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
7202                BUG_ON(ret); /* -ENOMEM */
7203                ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
7204                BUG_ON(ret); /* -ENOMEM */
7205                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7206                                                  eb->len, flag,
7207                                                  btrfs_header_level(eb), 0);
7208                BUG_ON(ret); /* -ENOMEM */
7209                wc->flags[level] |= flag;
7210        }
7211
7212        /*
7213         * the block is shared by multiple trees, so it's not good to
7214         * keep the tree lock
7215         */
7216        if (path->locks[level] && level > 0) {
7217                btrfs_tree_unlock_rw(eb, path->locks[level]);
7218                path->locks[level] = 0;
7219        }
7220        return 0;
7221}
7222
7223/*
7224 * helper to process tree block pointer.
7225 *
7226 * when wc->stage == DROP_REFERENCE, this function checks
7227 * reference count of the block pointed to. if the block
7228 * is shared and we need to update back refs for the subtree
7229 * rooted at the block, this function changes wc->stage to
7230 * UPDATE_BACKREF. if the block is shared and there is no
7231 * need to update backrefs, this function drops the reference
7232 * to the block.
7233 *
7234 * NOTE: return value 1 means we should stop walking down.
7235 */
7236static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7237                                 struct btrfs_root *root,
7238                                 struct btrfs_path *path,
7239                                 struct walk_control *wc, int *lookup_info)
7240{
7241        u64 bytenr;
7242        u64 generation;
7243        u64 parent;
7244        u32 blocksize;
7245        struct btrfs_key key;
7246        struct extent_buffer *next;
7247        int level = wc->level;
7248        int reada = 0;
7249        int ret = 0;
7250
7251        generation = btrfs_node_ptr_generation(path->nodes[level],
7252                                               path->slots[level]);
7253        /*
7254         * if the lower level block was created before the snapshot
7255         * was created, we know there is no need to update back refs
7256         * for the subtree
7257         */
7258        if (wc->stage == UPDATE_BACKREF &&
7259            generation <= root->root_key.offset) {
7260                *lookup_info = 1;
7261                return 1;
7262        }
7263
7264        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7265        blocksize = btrfs_level_size(root, level - 1);
7266
7267        next = btrfs_find_tree_block(root, bytenr, blocksize);
7268        if (!next) {
7269                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7270                if (!next)
7271                        return -ENOMEM;
7272                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7273                                               level - 1);
7274                reada = 1;
7275        }
7276        btrfs_tree_lock(next);
7277        btrfs_set_lock_blocking(next);
7278
7279        ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7280                                       &wc->refs[level - 1],
7281                                       &wc->flags[level - 1]);
7282        if (ret < 0) {
7283                btrfs_tree_unlock(next);
7284                return ret;
7285        }
7286
7287        if (unlikely(wc->refs[level - 1] == 0)) {
7288                btrfs_err(root->fs_info, "Missing references.");
7289                BUG();
7290        }
7291        *lookup_info = 0;
7292
7293        if (wc->stage == DROP_REFERENCE) {
7294                if (wc->refs[level - 1] > 1) {
7295                        if (level == 1 &&
7296                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7297                                goto skip;
7298
7299                        if (!wc->update_ref ||
7300                            generation <= root->root_key.offset)
7301                                goto skip;
7302
7303                        btrfs_node_key_to_cpu(path->nodes[level], &key,
7304                                              path->slots[level]);
7305                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7306                        if (ret < 0)
7307                                goto skip;
7308
7309                        wc->stage = UPDATE_BACKREF;
7310                        wc->shared_level = level - 1;
7311                }
7312        } else {
7313                if (level == 1 &&
7314                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7315                        goto skip;
7316        }
7317
7318        if (!btrfs_buffer_uptodate(next, generation, 0)) {
7319                btrfs_tree_unlock(next);
7320                free_extent_buffer(next);
7321                next = NULL;
7322                *lookup_info = 1;
7323        }
7324
7325        if (!next) {
7326                if (reada && level == 1)
7327                        reada_walk_down(trans, root, wc, path);
7328                next = read_tree_block(root, bytenr, blocksize, generation);
7329                if (!next || !extent_buffer_uptodate(next)) {
7330                        free_extent_buffer(next);
7331                        return -EIO;
7332                }
7333                btrfs_tree_lock(next);
7334                btrfs_set_lock_blocking(next);
7335        }
7336
7337        level--;
7338        BUG_ON(level != btrfs_header_level(next));
7339        path->nodes[level] = next;
7340        path->slots[level] = 0;
7341        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7342        wc->level = level;
7343        if (wc->level == 1)
7344                wc->reada_slot = 0;
7345        return 0;
7346skip:
7347        wc->refs[level - 1] = 0;
7348        wc->flags[level - 1] = 0;
7349        if (wc->stage == DROP_REFERENCE) {
7350                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7351                        parent = path->nodes[level]->start;
7352                } else {
7353                        BUG_ON(root->root_key.objectid !=
7354                               btrfs_header_owner(path->nodes[level]));
7355                        parent = 0;
7356                }
7357
7358                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7359                                root->root_key.objectid, level - 1, 0, 0);
7360                BUG_ON(ret); /* -ENOMEM */
7361        }
7362        btrfs_tree_unlock(next);
7363        free_extent_buffer(next);
7364        *lookup_info = 1;
7365        return 1;
7366}
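/*
 * Compact decision sketch for DROP_REFERENCE in do_walk_down() above
 * (taken straight from the branches, shown only as a summary):
 *
 *   refs == 1                      -> keep walking down, free on the way up
 *   refs > 1, no backref update    -> skip: drop one ref on the child via
 *                                     btrfs_free_extent() and move on
 *   refs > 1, update_ref set and
 *   key not before update_progress -> switch to UPDATE_BACKREF for the
 *                                     subtree (wc->shared_level = level - 1)
 */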
7367
7368/*
7369 * helper to process tree block while walking up the tree.
7370 *
7371 * when wc->stage == DROP_REFERENCE, this function drops
7372 * reference count on the block.
7373 *
7374 * when wc->stage == UPDATE_BACKREF, this function changes
7375 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7376 * to UPDATE_BACKREF previously while processing the block.
7377 *
7378 * NOTE: return value 1 means we should stop walking up.
7379 */
7380static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7381                                 struct btrfs_root *root,
7382                                 struct btrfs_path *path,
7383                                 struct walk_control *wc)
7384{
7385        int ret;
7386        int level = wc->level;
7387        struct extent_buffer *eb = path->nodes[level];
7388        u64 parent = 0;
7389
7390        if (wc->stage == UPDATE_BACKREF) {
7391                BUG_ON(wc->shared_level < level);
7392                if (level < wc->shared_level)
7393                        goto out;
7394
7395                ret = find_next_key(path, level + 1, &wc->update_progress);
7396                if (ret > 0)
7397                        wc->update_ref = 0;
7398
7399                wc->stage = DROP_REFERENCE;
7400                wc->shared_level = -1;
7401                path->slots[level] = 0;
7402
7403                /*
7404                 * check reference count again if the block isn't locked.
7405                 * we should start walking down the tree again if reference
7406                 * count is one.
7407                 */
7408                if (!path->locks[level]) {
7409                        BUG_ON(level == 0);
7410                        btrfs_tree_lock(eb);
7411                        btrfs_set_lock_blocking(eb);
7412                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7413
7414                        ret = btrfs_lookup_extent_info(trans, root,
7415                                                       eb->start, level, 1,
7416                                                       &wc->refs[level],
7417                                                       &wc->flags[level]);
7418                        if (ret < 0) {
7419                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7420                                path->locks[level] = 0;
7421                                return ret;
7422                        }
7423                        BUG_ON(wc->refs[level] == 0);
7424                        if (wc->refs[level] == 1) {
7425                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7426                                path->locks[level] = 0;
7427                                return 1;
7428                        }
7429                }
7430        }
7431
7432        /* wc->stage == DROP_REFERENCE */
7433        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7434
7435        if (wc->refs[level] == 1) {
7436                if (level == 0) {
7437                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7438                                ret = btrfs_dec_ref(trans, root, eb, 1,
7439                                                    wc->for_reloc);
7440                        else
7441                                ret = btrfs_dec_ref(trans, root, eb, 0,
7442                                                    wc->for_reloc);
7443                        BUG_ON(ret); /* -ENOMEM */
7444                }
7445                /* make block locked assertion in clean_tree_block happy */
7446                if (!path->locks[level] &&
7447                    btrfs_header_generation(eb) == trans->transid) {
7448                        btrfs_tree_lock(eb);
7449                        btrfs_set_lock_blocking(eb);
7450                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7451                }
7452                clean_tree_block(trans, root, eb);
7453        }
7454
7455        if (eb == root->node) {
7456                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7457                        parent = eb->start;
7458                else
7459                        BUG_ON(root->root_key.objectid !=
7460                               btrfs_header_owner(eb));
7461        } else {
7462                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7463                        parent = path->nodes[level + 1]->start;
7464                else
7465                        BUG_ON(root->root_key.objectid !=
7466                               btrfs_header_owner(path->nodes[level + 1]));
7467        }
7468
7469        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7470out:
7471        wc->refs[level] = 0;
7472        wc->flags[level] = 0;
7473        return 0;
7474}
7475
7476static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7477                                   struct btrfs_root *root,
7478                                   struct btrfs_path *path,
7479                                   struct walk_control *wc)
7480{
7481        int level = wc->level;
7482        int lookup_info = 1;
7483        int ret;
7484
7485        while (level >= 0) {
7486                ret = walk_down_proc(trans, root, path, wc, lookup_info);
7487                if (ret > 0)
7488                        break;
7489
7490                if (level == 0)
7491                        break;
7492
7493                if (path->slots[level] >=
7494                    btrfs_header_nritems(path->nodes[level]))
7495                        break;
7496
7497                ret = do_walk_down(trans, root, path, wc, &lookup_info);
7498                if (ret > 0) {
7499                        path->slots[level]++;
7500                        continue;
7501                } else if (ret < 0)
7502                        return ret;
7503                level = wc->level;
7504        }
7505        return 0;
7506}
7507
7508static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7509                                 struct btrfs_root *root,
7510                                 struct btrfs_path *path,
7511                                 struct walk_control *wc, int max_level)
7512{
7513        int level = wc->level;
7514        int ret;
7515
7516        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7517        while (level < max_level && path->nodes[level]) {
7518                wc->level = level;
7519                if (path->slots[level] + 1 <
7520                    btrfs_header_nritems(path->nodes[level])) {
7521                        path->slots[level]++;
7522                        return 0;
7523                } else {
7524                        ret = walk_up_proc(trans, root, path, wc);
7525                        if (ret > 0)
7526                                return 0;
7527
7528                        if (path->locks[level]) {
7529                                btrfs_tree_unlock_rw(path->nodes[level],
7530                                                     path->locks[level]);
7531                                path->locks[level] = 0;
7532                        }
7533                        free_extent_buffer(path->nodes[level]);
7534                        path->nodes[level] = NULL;
7535                        level++;
7536                }
7537        }
7538        return 1;
7539}
7540
7541/*
7542 * drop a subvolume tree.
7543 *
7544 * this function traverses the tree freeing any blocks that are only
7545 * referenced by the tree.
7546 *
7547 * when a shared tree block is found, this function decreases its
7548 * reference count by one. if update_ref is true, this function
7549 * also makes sure backrefs for the shared block and all lower level
7550 * blocks are properly updated.
7551 *
7552 * If called with for_reloc == 0, may exit early with -EAGAIN
7553 */
7554int btrfs_drop_snapshot(struct btrfs_root *root,
7555                         struct btrfs_block_rsv *block_rsv, int update_ref,
7556                         int for_reloc)
7557{
7558        struct btrfs_path *path;
7559        struct btrfs_trans_handle *trans;
7560        struct btrfs_root *tree_root = root->fs_info->tree_root;
7561        struct btrfs_root_item *root_item = &root->root_item;
7562        struct walk_control *wc;
7563        struct btrfs_key key;
7564        int err = 0;
7565        int ret;
7566        int level;
7567        bool root_dropped = false;
7568
7569        path = btrfs_alloc_path();
7570        if (!path) {
7571                err = -ENOMEM;
7572                goto out;
7573        }
7574
7575        wc = kzalloc(sizeof(*wc), GFP_NOFS);
7576        if (!wc) {
7577                btrfs_free_path(path);
7578                err = -ENOMEM;
7579                goto out;
7580        }
7581
7582        trans = btrfs_start_transaction(tree_root, 0);
7583        if (IS_ERR(trans)) {
7584                err = PTR_ERR(trans);
7585                goto out_free;
7586        }
7587
7588        if (block_rsv)
7589                trans->block_rsv = block_rsv;
7590
7591        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7592                level = btrfs_header_level(root->node);
7593                path->nodes[level] = btrfs_lock_root_node(root);
7594                btrfs_set_lock_blocking(path->nodes[level]);
7595                path->slots[level] = 0;
7596                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7597                memset(&wc->update_progress, 0,
7598                       sizeof(wc->update_progress));
7599        } else {
7600                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7601                memcpy(&wc->update_progress, &key,
7602                       sizeof(wc->update_progress));
7603
7604                level = root_item->drop_level;
7605                BUG_ON(level == 0);
7606                path->lowest_level = level;
7607                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7608                path->lowest_level = 0;
7609                if (ret < 0) {
7610                        err = ret;
7611                        goto out_end_trans;
7612                }
7613                WARN_ON(ret > 0);
7614
7615                /*
7616                 * unlock our path, this is safe because only this
7617                 * function is allowed to delete this snapshot
7618                 */
7619                btrfs_unlock_up_safe(path, 0);
7620
7621                level = btrfs_header_level(root->node);
7622                while (1) {
7623                        btrfs_tree_lock(path->nodes[level]);
7624                        btrfs_set_lock_blocking(path->nodes[level]);
7625                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7626
7627                        ret = btrfs_lookup_extent_info(trans, root,
7628                                                path->nodes[level]->start,
7629                                                level, 1, &wc->refs[level],
7630                                                &wc->flags[level]);
7631                        if (ret < 0) {
7632                                err = ret;
7633                                goto out_end_trans;
7634                        }
7635                        BUG_ON(wc->refs[level] == 0);
7636
7637                        if (level == root_item->drop_level)
7638                                break;
7639
7640                        btrfs_tree_unlock(path->nodes[level]);
7641                        path->locks[level] = 0;
7642                        WARN_ON(wc->refs[level] != 1);
7643                        level--;
7644                }
7645        }
7646
7647        wc->level = level;
7648        wc->shared_level = -1;
7649        wc->stage = DROP_REFERENCE;
7650        wc->update_ref = update_ref;
7651        wc->keep_locks = 0;
7652        wc->for_reloc = for_reloc;
7653        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7654
7655        while (1) {
7656
7657                ret = walk_down_tree(trans, root, path, wc);
7658                if (ret < 0) {
7659                        err = ret;
7660                        break;
7661                }
7662
7663                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7664                if (ret < 0) {
7665                        err = ret;
7666                        break;
7667                }
7668
7669                if (ret > 0) {
7670                        BUG_ON(wc->stage != DROP_REFERENCE);
7671                        break;
7672                }
7673
7674                if (wc->stage == DROP_REFERENCE) {
7675                        level = wc->level;
7676                        btrfs_node_key(path->nodes[level],
7677                                       &root_item->drop_progress,
7678                                       path->slots[level]);
7679                        root_item->drop_level = level;
7680                }
7681
7682                BUG_ON(wc->level == 0);
7683                if (btrfs_should_end_transaction(trans, tree_root) ||
7684                    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
7685                        ret = btrfs_update_root(trans, tree_root,
7686                                                &root->root_key,
7687                                                root_item);
7688                        if (ret) {
7689                                btrfs_abort_transaction(trans, tree_root, ret);
7690                                err = ret;
7691                                goto out_end_trans;
7692                        }
7693
7694                        btrfs_end_transaction_throttle(trans, tree_root);
7695                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7696                                pr_debug("BTRFS: drop snapshot early exit\n");
7697                                err = -EAGAIN;
7698                                goto out_free;
7699                        }
7700
7701                        trans = btrfs_start_transaction(tree_root, 0);
7702                        if (IS_ERR(trans)) {
7703                                err = PTR_ERR(trans);
7704                                goto out_free;
7705                        }
7706                        if (block_rsv)
7707                                trans->block_rsv = block_rsv;
7708                }
7709        }
7710        btrfs_release_path(path);
7711        if (err)
7712                goto out_end_trans;
7713
7714        ret = btrfs_del_root(trans, tree_root, &root->root_key);
7715        if (ret) {
7716                btrfs_abort_transaction(trans, tree_root, ret);
7717                goto out_end_trans;
7718        }
7719
7720        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7721                ret = btrfs_find_root(tree_root, &root->root_key, path,
7722                                      NULL, NULL);
7723                if (ret < 0) {
7724                        btrfs_abort_transaction(trans, tree_root, ret);
7725                        err = ret;
7726                        goto out_end_trans;
7727                } else if (ret > 0) {
7728                        /* if we fail to delete the orphan item this time
7729                         * around, it'll get picked up the next time.
7730                         *
7731                         * The most common failure here is just -ENOENT.
7732                         */
7733                        btrfs_del_orphan_item(trans, tree_root,
7734                                              root->root_key.objectid);
7735                }
7736        }
7737
7738        if (root->in_radix) {
7739                btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7740        } else {
7741                free_extent_buffer(root->node);
7742                free_extent_buffer(root->commit_root);
7743                btrfs_put_fs_root(root);
7744        }
7745        root_dropped = true;
7746out_end_trans:
7747        btrfs_end_transaction_throttle(trans, tree_root);
7748out_free:
7749        kfree(wc);
7750        btrfs_free_path(path);
7751out:
7752        /*
7753         * So if we need to stop dropping the snapshot for whatever reason we
7754         * need to make sure to add it back to the dead root list so that we
7755         * keep trying to do the work later.  This also cleans up roots we
7756         * don't have in the radix (like when we recover after a power fail
7757         * or unmount) so we don't leak memory.
7758         */
7759        if (!for_reloc && root_dropped == false)
7760                btrfs_add_dead_root(root);
7761        if (err && err != -EAGAIN)
7762                btrfs_std_error(root->fs_info, err);
7763        return err;
7764}
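/*
 * Sketch of how the drop above survives transaction restarts: before the
 * transaction is ended, the walk position is stashed in the root item,
 * roughly
 *
 *   root_item->drop_progress = node key at path->slots[wc->level]
 *   root_item->drop_level    = wc->level
 *
 * and a later invocation re-seeds the walk from drop_progress via
 * btrfs_search_slot() instead of starting over at the root.
 */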
7765
7766/*
7767 * drop subtree rooted at tree block 'node'.
7768 *
7769 * NOTE: this function will unlock and release tree block 'node'
7770 * only used by relocation code
7771 */
7772int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7773                        struct btrfs_root *root,
7774                        struct extent_buffer *node,
7775                        struct extent_buffer *parent)
7776{
7777        struct btrfs_path *path;
7778        struct walk_control *wc;
7779        int level;
7780        int parent_level;
7781        int ret = 0;
7782        int wret;
7783
7784        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7785
7786        path = btrfs_alloc_path();
7787        if (!path)
7788                return -ENOMEM;
7789
7790        wc = kzalloc(sizeof(*wc), GFP_NOFS);
7791        if (!wc) {
7792                btrfs_free_path(path);
7793                return -ENOMEM;
7794        }
7795
7796        btrfs_assert_tree_locked(parent);
7797        parent_level = btrfs_header_level(parent);
7798        extent_buffer_get(parent);
7799        path->nodes[parent_level] = parent;
7800        path->slots[parent_level] = btrfs_header_nritems(parent);
7801
7802        btrfs_assert_tree_locked(node);
7803        level = btrfs_header_level(node);
7804        path->nodes[level] = node;
7805        path->slots[level] = 0;
7806        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7807
7808        wc->refs[parent_level] = 1;
7809        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7810        wc->level = level;
7811        wc->shared_level = -1;
7812        wc->stage = DROP_REFERENCE;
7813        wc->update_ref = 0;
7814        wc->keep_locks = 1;
7815        wc->for_reloc = 1;
7816        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7817
7818        while (1) {
7819                wret = walk_down_tree(trans, root, path, wc);
7820                if (wret < 0) {
7821                        ret = wret;
7822                        break;
7823                }
7824
7825                wret = walk_up_tree(trans, root, path, wc, parent_level);
7826                if (wret < 0)
7827                        ret = wret;
7828                if (wret != 0)
7829                        break;
7830        }
7831
7832        kfree(wc);
7833        btrfs_free_path(path);
7834        return ret;
7835}
7836
7837static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7838{
7839        u64 num_devices;
7840        u64 stripped;
7841
7842        /*
7843         * if restripe for this chunk_type is on, pick the target profile
7844         * and return, otherwise do the usual balance
7845         */
7846        stripped = get_restripe_target(root->fs_info, flags);
7847        if (stripped)
7848                return extended_to_chunk(stripped);
7849
7850        /*
7851         * we add in the count of missing devices because we want
7852         * to make sure that any RAID levels on a degraded FS
7853         * continue to be honored.
7854         */
7855        num_devices = root->fs_info->fs_devices->rw_devices +
7856                root->fs_info->fs_devices->missing_devices;
7857
7858        stripped = BTRFS_BLOCK_GROUP_RAID0 |
7859                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7860                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7861
7862        if (num_devices == 1) {
7863                stripped |= BTRFS_BLOCK_GROUP_DUP;
7864                stripped = flags & ~stripped;
7865
7866                /* turn raid0 into single device chunks */
7867                if (flags & BTRFS_BLOCK_GROUP_RAID0)
7868                        return stripped;
7869
7870                /* turn mirroring into duplication */
7871                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7872                             BTRFS_BLOCK_GROUP_RAID10))
7873                        return stripped | BTRFS_BLOCK_GROUP_DUP;
7874        } else {
7875                /* they already had raid on here, just return */
7876                if (flags & stripped)
7877                        return flags;
7878
7879                stripped |= BTRFS_BLOCK_GROUP_DUP;
7880                stripped = flags & ~stripped;
7881
7882                /* switch duplicated blocks with raid1 */
7883                if (flags & BTRFS_BLOCK_GROUP_DUP)
7884                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
7885
7886                /* this is drive concat, leave it alone */
7887        }
7888
7889        return flags;
7890}
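/*
 * Worked examples for the conversions above (no restripe target set):
 *
 *   1 rw device (missing devices included in the count):
 *     RAID1 or RAID10 -> DUP,  RAID0 -> single
 *   2 or more devices:
 *     DUP -> RAID1,  existing RAID profiles kept as-is,
 *     plain single/concat left alone
 */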
7891
7892static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7893{
7894        struct btrfs_space_info *sinfo = cache->space_info;
7895        u64 num_bytes;
7896        u64 min_allocable_bytes;
7897        int ret = -ENOSPC;
7898
7899
7900        /*
7901         * We need some metadata space and system metadata space for
7902         * allocating chunks in some corner cases, unless we're forced to
7903         * set it read-only.
7904         */
7905        if ((sinfo->flags &
7906             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7907            !force)
7908                min_allocable_bytes = 1 * 1024 * 1024;
7909        else
7910                min_allocable_bytes = 0;
7911
7912        spin_lock(&sinfo->lock);
7913        spin_lock(&cache->lock);
7914
7915        if (cache->ro) {
7916                ret = 0;
7917                goto out;
7918        }
7919
7920        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7921                    cache->bytes_super - btrfs_block_group_used(&cache->item);
7922
7923        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7924            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7925            min_allocable_bytes <= sinfo->total_bytes) {
7926                sinfo->bytes_readonly += num_bytes;
7927                cache->ro = 1;
7928                ret = 0;
7929        }
7930out:
7931        spin_unlock(&cache->lock);
7932        spin_unlock(&sinfo->lock);
7933        return ret;
7934}
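/*
 * The read-only check above reduces to one inequality; for a metadata
 * group with force == 0 (all values from the space_info / cache):
 *
 *   num_bytes = key.offset - reserved - pinned - bytes_super - used
 *
 *   used + reserved + pinned + may_use + readonly + num_bytes
 *        + 1MiB (min_allocable_bytes)  <=  total_bytes   ->  ok to set RO
 *
 * i.e. everything this group could still hand out must fit elsewhere in
 * the space_info, plus a small cushion for metadata/system chunks.
 */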
7935
7936int btrfs_set_block_group_ro(struct btrfs_root *root,
7937                             struct btrfs_block_group_cache *cache)
7938
7939{
7940        struct btrfs_trans_handle *trans;
7941        u64 alloc_flags;
7942        int ret;
7943
7944        BUG_ON(cache->ro);
7945
7946        trans = btrfs_join_transaction(root);
7947        if (IS_ERR(trans))
7948                return PTR_ERR(trans);
7949
7950        alloc_flags = update_block_group_flags(root, cache->flags);
7951        if (alloc_flags != cache->flags) {
7952                ret = do_chunk_alloc(trans, root, alloc_flags,
7953                                     CHUNK_ALLOC_FORCE);
7954                if (ret < 0)
7955                        goto out;
7956        }
7957
7958        ret = set_block_group_ro(cache, 0);
7959        if (!ret)
7960                goto out;
7961        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7962        ret = do_chunk_alloc(trans, root, alloc_flags,
7963                             CHUNK_ALLOC_FORCE);
7964        if (ret < 0)
7965                goto out;
7966        ret = set_block_group_ro(cache, 0);
7967out:
7968        btrfs_end_transaction(trans, root);
7969        return ret;
7970}
7971
7972int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7973                            struct btrfs_root *root, u64 type)
7974{
7975        u64 alloc_flags = get_alloc_profile(root, type);
7976        return do_chunk_alloc(trans, root, alloc_flags,
7977                              CHUNK_ALLOC_FORCE);
7978}
7979
7980/*
7981 * helper to account the unused space of all the readonly block groups in the
7982 * list. takes mirrors into account.
7983 */
7984static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7985{
7986        struct btrfs_block_group_cache *block_group;
7987        u64 free_bytes = 0;
7988        int factor;
7989
7990        list_for_each_entry(block_group, groups_list, list) {
7991                spin_lock(&block_group->lock);
7992
7993                if (!block_group->ro) {
7994                        spin_unlock(&block_group->lock);
7995                        continue;
7996                }
7997
7998                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7999                                          BTRFS_BLOCK_GROUP_RAID10 |
8000                                          BTRFS_BLOCK_GROUP_DUP))
8001                        factor = 2;
8002                else
8003                        factor = 1;
8004
8005                free_bytes += (block_group->key.offset -
8006                               btrfs_block_group_used(&block_group->item)) *
8007                               factor;
8008
8009                spin_unlock(&block_group->lock);
8010        }
8011
8012        return free_bytes;
8013}
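/*
 * Small worked example for the mirror factor above (numbers invented):
 * a read-only RAID1 block group with key.offset = 1024MiB and 400MiB
 * used contributes (1024 - 400) * 2 = 1248MiB of raw free space, while
 * the same group as RAID0/single would contribute 624MiB (factor 1).
 */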
8014
8015/*
8016 * helper to account the unused space of all the readonly block groups in the
8017 * space_info. takes mirrors into account.
8018 */
8019u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8020{
8021        int i;
8022        u64 free_bytes = 0;
8023
8024        spin_lock(&sinfo->lock);
8025
8026        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8027                if (!list_empty(&sinfo->block_groups[i]))
8028                        free_bytes += __btrfs_get_ro_block_group_free_space(
8029                                                &sinfo->block_groups[i]);
8030
8031        spin_unlock(&sinfo->lock);
8032
8033        return free_bytes;
8034}
8035
8036void btrfs_set_block_group_rw(struct btrfs_root *root,
8037                              struct btrfs_block_group_cache *cache)
8038{
8039        struct btrfs_space_info *sinfo = cache->space_info;
8040        u64 num_bytes;
8041
8042        BUG_ON(!cache->ro);
8043
8044        spin_lock(&sinfo->lock);
8045        spin_lock(&cache->lock);
8046        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8047                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8048        sinfo->bytes_readonly -= num_bytes;
8049        cache->ro = 0;
8050        spin_unlock(&cache->lock);
8051        spin_unlock(&sinfo->lock);
8052}
8053
8054/*
8055 * checks to see if it's even possible to relocate this block group.
8056 *
8057 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
8058 * ok to go ahead and try.
8059 */
8060int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8061{
8062        struct btrfs_block_group_cache *block_group;
8063        struct btrfs_space_info *space_info;
8064        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8065        struct btrfs_device *device;
8066        struct btrfs_trans_handle *trans;
8067        u64 min_free;
8068        u64 dev_min = 1;
8069        u64 dev_nr = 0;
8070        u64 target;
8071        int index;
8072        int full = 0;
8073        int ret = 0;
8074
8075        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8076
8077        /* odd, couldn't find the block group, leave it alone */
8078        if (!block_group)
8079                return -1;
8080
8081        min_free = btrfs_block_group_used(&block_group->item);
8082
8083        /* no bytes used, we're good */
8084        if (!min_free)
8085                goto out;
8086
8087        space_info = block_group->space_info;
8088        spin_lock(&space_info->lock);
8089
8090        full = space_info->full;
8091
8092        /*
8093         * if this is the last block group we have in this space, we can't
8094         * relocate it unless we're able to allocate a new chunk below.
8095         *
8096         * Otherwise, we need to make sure we have room in the space to handle
8097         * all of the extents from this block group.  If we can, we're good
8098         */
8099        if ((space_info->total_bytes != block_group->key.offset) &&
8100            (space_info->bytes_used + space_info->bytes_reserved +
8101             space_info->bytes_pinned + space_info->bytes_readonly +
8102             min_free < space_info->total_bytes)) {
8103                spin_unlock(&space_info->lock);
8104                goto out;
8105        }
8106        spin_unlock(&space_info->lock);
8107
8108        /*
8109         * ok we don't have enough space, but maybe we have free space on our
8110         * devices to allocate new chunks for relocation, so loop through our
8111         * alloc devices and guess if we have enough space.  if this block
8112         * group is going to be restriped, run checks against the target
8113         * profile instead of the current one.
8114         */
8115        ret = -1;
8116
8117        /*
8118         * index:
8119         *      0: raid10
8120         *      1: raid1
8121         *      2: dup
8122         *      3: raid0
8123         *      4: single
8124         */
8125        target = get_restripe_target(root->fs_info, block_group->flags);
8126        if (target) {
8127                index = __get_raid_index(extended_to_chunk(target));
8128        } else {
8129                /*
8130                 * this is just a balance, so if we were marked as full
8131                 * we know there is no space for a new chunk
8132                 */
8133                if (full)
8134                        goto out;
8135
8136                index = get_block_group_index(block_group);
8137        }
8138
8139        if (index == BTRFS_RAID_RAID10) {
8140                dev_min = 4;
8141                /* Divide by 2 */
8142                min_free >>= 1;
8143        } else if (index == BTRFS_RAID_RAID1) {
8144                dev_min = 2;
8145        } else if (index == BTRFS_RAID_DUP) {
8146                /* Multiply by 2 */
8147                min_free <<= 1;
8148        } else if (index == BTRFS_RAID_RAID0) {
8149                dev_min = fs_devices->rw_devices;
8150                do_div(min_free, dev_min);
8151        }
8152
8153        /* We need to do this so that we can look at pending chunks */
8154        trans = btrfs_join_transaction(root);
8155        if (IS_ERR(trans)) {
8156                ret = PTR_ERR(trans);
8157                goto out;
8158        }
8159
8160        mutex_lock(&root->fs_info->chunk_mutex);
8161        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8162                u64 dev_offset;
8163
8164                /*
8165                 * check to make sure we can actually find a chunk with enough
8166                 * space to fit our block group in.
8167                 */
8168                if (device->total_bytes > device->bytes_used + min_free &&
8169                    !device->is_tgtdev_for_dev_replace) {
8170                        ret = find_free_dev_extent(trans, device, min_free,
8171                                                   &dev_offset, NULL);
8172                        if (!ret)
8173                                dev_nr++;
8174
8175                        if (dev_nr >= dev_min)
8176                                break;
8177
8178                        ret = -1;
8179                }
8180        }
8181        mutex_unlock(&root->fs_info->chunk_mutex);
8182        btrfs_end_transaction(trans, root);
8183out:
8184        btrfs_put_block_group(block_group);
8185        return ret;
8186}
8187
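/*
 * Find the first BLOCK_GROUP_ITEM in the extent tree whose objectid is at or
 * after @key->objectid.  Returns 0 with @path pointing at the item, a positive
 * value if no such item exists, or a negative errno on search failure.
 */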
8188static int find_first_block_group(struct btrfs_root *root,
8189                struct btrfs_path *path, struct btrfs_key *key)
8190{
8191        int ret = 0;
8192        struct btrfs_key found_key;
8193        struct extent_buffer *leaf;
8194        int slot;
8195
8196        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8197        if (ret < 0)
8198                goto out;
8199
8200        while (1) {
8201                slot = path->slots[0];
8202                leaf = path->nodes[0];
8203                if (slot >= btrfs_header_nritems(leaf)) {
8204                        ret = btrfs_next_leaf(root, path);
8205                        if (ret == 0)
8206                                continue;
8207                        if (ret < 0)
8208                                goto out;
8209                        break;
8210                }
8211                btrfs_item_key_to_cpu(leaf, &found_key, slot);
8212
8213                if (found_key.objectid >= key->objectid &&
8214                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8215                        ret = 0;
8216                        goto out;
8217                }
8218                path->slots[0]++;
8219        }
8220out:
8221        return ret;
8222}
8223
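/*
 * Walk all block groups and drop the inode reference (iref) each one holds on
 * its free space cache inode so those inodes can be released.
 */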
8224void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8225{
8226        struct btrfs_block_group_cache *block_group;
8227        u64 last = 0;
8228
8229        while (1) {
8230                struct inode *inode;
8231
8232                block_group = btrfs_lookup_first_block_group(info, last);
8233                while (block_group) {
8234                        spin_lock(&block_group->lock);
8235                        if (block_group->iref)
8236                                break;
8237                        spin_unlock(&block_group->lock);
8238                        block_group = next_block_group(info->tree_root,
8239                                                       block_group);
8240                }
8241                if (!block_group) {
8242                        if (last == 0)
8243                                break;
8244                        last = 0;
8245                        continue;
8246                }
8247
8248                inode = block_group->inode;
8249                block_group->iref = 0;
8250                block_group->inode = NULL;
8251                spin_unlock(&block_group->lock);
8252                iput(inode);
8253                last = block_group->key.objectid + block_group->key.offset;
8254                btrfs_put_block_group(block_group);
8255        }
8256}
8257
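/*
 * Tear down every in-memory block group cache and space_info structure.
 * Only used during the final stages of unmount.
 */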
8258int btrfs_free_block_groups(struct btrfs_fs_info *info)
8259{
8260        struct btrfs_block_group_cache *block_group;
8261        struct btrfs_space_info *space_info;
8262        struct btrfs_caching_control *caching_ctl;
8263        struct rb_node *n;
8264
8265        down_write(&info->commit_root_sem);
8266        while (!list_empty(&info->caching_block_groups)) {
8267                caching_ctl = list_entry(info->caching_block_groups.next,
8268                                         struct btrfs_caching_control, list);
8269                list_del(&caching_ctl->list);
8270                put_caching_control(caching_ctl);
8271        }
8272        up_write(&info->commit_root_sem);
8273
8274        spin_lock(&info->block_group_cache_lock);
8275        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8276                block_group = rb_entry(n, struct btrfs_block_group_cache,
8277                                       cache_node);
8278                rb_erase(&block_group->cache_node,
8279                         &info->block_group_cache_tree);
8280                spin_unlock(&info->block_group_cache_lock);
8281
8282                down_write(&block_group->space_info->groups_sem);
8283                list_del(&block_group->list);
8284                up_write(&block_group->space_info->groups_sem);
8285
8286                if (block_group->cached == BTRFS_CACHE_STARTED)
8287                        wait_block_group_cache_done(block_group);
8288
8289                /*
8290                 * We haven't cached this block group, which means we could
8291                 * possibly have excluded extents on this block group.
8292                 */
8293                if (block_group->cached == BTRFS_CACHE_NO ||
8294                    block_group->cached == BTRFS_CACHE_ERROR)
8295                        free_excluded_extents(info->extent_root, block_group);
8296
8297                btrfs_remove_free_space_cache(block_group);
8298                btrfs_put_block_group(block_group);
8299
8300                spin_lock(&info->block_group_cache_lock);
8301        }
8302        spin_unlock(&info->block_group_cache_lock);
8303
8304        /* now that all the block groups are freed, go through and
8305         * free all the space_info structs.  This is only called during
8306         * the final stages of unmount, and so we know nobody is
8307         * using them.  We call synchronize_rcu() once before we start,
8308         * just to be on the safe side.
8309         */
8310        synchronize_rcu();
8311
8312        release_global_block_rsv(info);
8313
8314        while (!list_empty(&info->space_info)) {
8315                int i;
8316
8317                space_info = list_entry(info->space_info.next,
8318                                        struct btrfs_space_info,
8319                                        list);
8320                if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8321                        if (WARN_ON(space_info->bytes_pinned > 0 ||
8322                            space_info->bytes_reserved > 0 ||
8323                            space_info->bytes_may_use > 0)) {
8324                                dump_space_info(space_info, 0, 0);
8325                        }
8326                }
8327                list_del(&space_info->list);
8328                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8329                        struct kobject *kobj;
8330                        kobj = &space_info->block_group_kobjs[i];
8331                        if (kobj->parent) {
8332                                kobject_del(kobj);
8333                                kobject_put(kobj);
8334                        }
8335                }
8336                kobject_del(&space_info->kobj);
8337                kobject_put(&space_info->kobj);
8338        }
8339        return 0;
8340}
8341
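/*
 * Add @cache to its space_info's list for the matching RAID level.  The first
 * block group of a given level also registers the per-level sysfs kobject.
 */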
8342static void __link_block_group(struct btrfs_space_info *space_info,
8343                               struct btrfs_block_group_cache *cache)
8344{
8345        int index = get_block_group_index(cache);
8346        bool first = false;
8347
8348        down_write(&space_info->groups_sem);
8349        if (list_empty(&space_info->block_groups[index]))
8350                first = true;
8351        list_add_tail(&cache->list, &space_info->block_groups[index]);
8352        up_write(&space_info->groups_sem);
8353
8354        if (first) {
8355                struct kobject *kobj = &space_info->block_group_kobjs[index];
8356                int ret;
8357
8358                kobject_get(&space_info->kobj); /* put in release */
8359                ret = kobject_add(kobj, &space_info->kobj, "%s",
8360                                  get_raid_name(index));
8361                if (ret) {
8362                        pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8363                        kobject_put(&space_info->kobj);
8364                }
8365        }
8366}
8367
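/*
 * Allocate and initialize an in-memory block group cache covering
 * [@start, @start + @size).  No disk I/O is done here.
 */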
8368static struct btrfs_block_group_cache *
8369btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8370{
8371        struct btrfs_block_group_cache *cache;
8372
8373        cache = kzalloc(sizeof(*cache), GFP_NOFS);
8374        if (!cache)
8375                return NULL;
8376
8377        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8378                                        GFP_NOFS);
8379        if (!cache->free_space_ctl) {
8380                kfree(cache);
8381                return NULL;
8382        }
8383
8384        cache->key.objectid = start;
8385        cache->key.offset = size;
8386        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8387
8388        cache->sectorsize = root->sectorsize;
8389        cache->fs_info = root->fs_info;
8390        cache->full_stripe_len = btrfs_full_stripe_len(root,
8391                                               &root->fs_info->mapping_tree,
8392                                               start);
8393        atomic_set(&cache->count, 1);
8394        spin_lock_init(&cache->lock);
8395        INIT_LIST_HEAD(&cache->list);
8396        INIT_LIST_HEAD(&cache->cluster_list);
8397        INIT_LIST_HEAD(&cache->new_bg_list);
8398        btrfs_init_free_space_ctl(cache);
8399
8400        return cache;
8401}
8402
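/*
 * Read all block group items from the extent tree at mount time and build the
 * in-memory block group caches and space_info accounting.
 */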
8403int btrfs_read_block_groups(struct btrfs_root *root)
8404{
8405        struct btrfs_path *path;
8406        int ret;
8407        struct btrfs_block_group_cache *cache;
8408        struct btrfs_fs_info *info = root->fs_info;
8409        struct btrfs_space_info *space_info;
8410        struct btrfs_key key;
8411        struct btrfs_key found_key;
8412        struct extent_buffer *leaf;
8413        int need_clear = 0;
8414        u64 cache_gen;
8415
8416        root = info->extent_root;
8417        key.objectid = 0;
8418        key.offset = 0;
8419        btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8420        path = btrfs_alloc_path();
8421        if (!path)
8422                return -ENOMEM;
8423        path->reada = 1;
8424
8425        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
8426        if (btrfs_test_opt(root, SPACE_CACHE) &&
8427            btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
8428                need_clear = 1;
8429        if (btrfs_test_opt(root, CLEAR_CACHE))
8430                need_clear = 1;
8431
8432        while (1) {
8433                ret = find_first_block_group(root, path, &key);
8434                if (ret > 0)
8435                        break;
8436                if (ret != 0)
8437                        goto error;
8438
8439                leaf = path->nodes[0];
8440                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8441
8442                cache = btrfs_create_block_group_cache(root, found_key.objectid,
8443                                                       found_key.offset);
8444                if (!cache) {
8445                        ret = -ENOMEM;
8446                        goto error;
8447                }
8448
8449                if (need_clear) {
8450                         * When we mount with an old space cache, we need to
8451                         * set BTRFS_DC_CLEAR and set the dirty flag.
8452                         *
8453                         * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8454                         *    truncate the old free space cache inode and
8455                         *    set up a new one.
8456                         * b) Setting the dirty flag makes sure that we flush
8457                         *    the new space cache info onto disk.
8458                         *    the new space cache info onto disk.
8459                         */
8460                        cache->disk_cache_state = BTRFS_DC_CLEAR;
8461                        if (btrfs_test_opt(root, SPACE_CACHE))
8462                                cache->dirty = 1;
8463                }
8464
8465                read_extent_buffer(leaf, &cache->item,
8466                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
8467                                   sizeof(cache->item));
8468                cache->flags = btrfs_block_group_flags(&cache->item);
8469
8470                key.objectid = found_key.objectid + found_key.offset;
8471                btrfs_release_path(path);
8472
8473                /*
8474                 * We need to exclude the super stripes now so that the space
8475                 * info has super bytes accounted for, otherwise we'll think
8476                 * we have more space than we actually do.
8477                 */
8478                ret = exclude_super_stripes(root, cache);
8479                if (ret) {
8480                        /*
8481                         * We may have excluded something, so call this just in
8482                         * case.
8483                         */
8484                        free_excluded_extents(root, cache);
8485                        btrfs_put_block_group(cache);
8486                        goto error;
8487                }
8488
8489                /*
8490                 * check for two cases, either we are full, and therefore
8491                 * don't need to bother with the caching work since we won't
8492                 * find any space, or we are empty, and we can just add all
8493                 * the space in and be done with it.  This saves us a lot of
8494                 * time, particularly in the full case.
8495                 */
8496                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8497                        cache->last_byte_to_unpin = (u64)-1;
8498                        cache->cached = BTRFS_CACHE_FINISHED;
8499                        free_excluded_extents(root, cache);
8500                } else if (btrfs_block_group_used(&cache->item) == 0) {
8501                        cache->last_byte_to_unpin = (u64)-1;
8502                        cache->cached = BTRFS_CACHE_FINISHED;
8503                        add_new_free_space(cache, root->fs_info,
8504                                           found_key.objectid,
8505                                           found_key.objectid +
8506                                           found_key.offset);
8507                        free_excluded_extents(root, cache);
8508                }
8509
8510                ret = btrfs_add_block_group_cache(root->fs_info, cache);
8511                if (ret) {
8512                        btrfs_remove_free_space_cache(cache);
8513                        btrfs_put_block_group(cache);
8514                        goto error;
8515                }
8516
8517                ret = update_space_info(info, cache->flags, found_key.offset,
8518                                        btrfs_block_group_used(&cache->item),
8519                                        &space_info);
8520                if (ret) {
8521                        btrfs_remove_free_space_cache(cache);
8522                        spin_lock(&info->block_group_cache_lock);
8523                        rb_erase(&cache->cache_node,
8524                                 &info->block_group_cache_tree);
8525                        spin_unlock(&info->block_group_cache_lock);
8526                        btrfs_put_block_group(cache);
8527                        goto error;
8528                }
8529
8530                cache->space_info = space_info;
8531                spin_lock(&cache->space_info->lock);
8532                cache->space_info->bytes_readonly += cache->bytes_super;
8533                spin_unlock(&cache->space_info->lock);
8534
8535                __link_block_group(space_info, cache);
8536
8537                set_avail_alloc_bits(root->fs_info, cache->flags);
8538                if (btrfs_chunk_readonly(root, cache->key.objectid))
8539                        set_block_group_ro(cache, 1);
8540        }
8541
8542        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8543                if (!(get_alloc_profile(root, space_info->flags) &
8544                      (BTRFS_BLOCK_GROUP_RAID10 |
8545                       BTRFS_BLOCK_GROUP_RAID1 |
8546                       BTRFS_BLOCK_GROUP_RAID5 |
8547                       BTRFS_BLOCK_GROUP_RAID6 |
8548                       BTRFS_BLOCK_GROUP_DUP)))
8549                        continue;
8550                /*
8551                 * avoid allocating from un-mirrored block groups if there are
8552                 * mirrored block groups.
8553                 */
8554                list_for_each_entry(cache,
8555                                &space_info->block_groups[BTRFS_RAID_RAID0],
8556                                list)
8557                        set_block_group_ro(cache, 1);
8558                list_for_each_entry(cache,
8559                                &space_info->block_groups[BTRFS_RAID_SINGLE],
8560                                list)
8561                        set_block_group_ro(cache, 1);
8562        }
8563
8564        init_global_block_rsv(info);
8565        ret = 0;
8566error:
8567        btrfs_free_path(path);
8568        return ret;
8569}
8570
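/*
 * Insert the block group items for all block groups created during the
 * current transaction and finish their chunk allocation.
 */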
8571void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8572                                       struct btrfs_root *root)
8573{
8574        struct btrfs_block_group_cache *block_group, *tmp;
8575        struct btrfs_root *extent_root = root->fs_info->extent_root;
8576        struct btrfs_block_group_item item;
8577        struct btrfs_key key;
8578        int ret = 0;
8579
8580        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8581                                 new_bg_list) {
8582                list_del_init(&block_group->new_bg_list);
8583
8584                if (ret)
8585                        continue;
8586
8587                spin_lock(&block_group->lock);
8588                memcpy(&item, &block_group->item, sizeof(item));
8589                memcpy(&key, &block_group->key, sizeof(key));
8590                spin_unlock(&block_group->lock);
8591
8592                ret = btrfs_insert_item(trans, extent_root, &key, &item,
8593                                        sizeof(item));
8594                if (ret)
8595                        btrfs_abort_transaction(trans, extent_root, ret);
8596                ret = btrfs_finish_chunk_alloc(trans, extent_root,
8597                                               key.objectid, key.offset);
8598                if (ret)
8599                        btrfs_abort_transaction(trans, extent_root, ret);
8600        }
8601}
8602
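/*
 * Create a new block group for a freshly allocated chunk: set up the in-memory
 * cache, account its space, and queue it on trans->new_bgs so that its item is
 * inserted by btrfs_create_pending_block_groups().
 */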
8603int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8604                           struct btrfs_root *root, u64 bytes_used,
8605                           u64 type, u64 chunk_objectid, u64 chunk_offset,
8606                           u64 size)
8607{
8608        int ret;
8609        struct btrfs_root *extent_root;
8610        struct btrfs_block_group_cache *cache;
8611
8612        extent_root = root->fs_info->extent_root;
8613
8614        root->fs_info->last_trans_log_full_commit = trans->transid;
8615
8616        cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8617        if (!cache)
8618                return -ENOMEM;
8619
8620        btrfs_set_block_group_used(&cache->item, bytes_used);
8621        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8622        btrfs_set_block_group_flags(&cache->item, type);
8623
8624        cache->flags = type;
8625        cache->last_byte_to_unpin = (u64)-1;
8626        cache->cached = BTRFS_CACHE_FINISHED;
8627        ret = exclude_super_stripes(root, cache);
8628        if (ret) {
8629                /*
8630                 * We may have excluded something, so call this just in
8631                 * case.
8632                 */
8633                free_excluded_extents(root, cache);
8634                btrfs_put_block_group(cache);
8635                return ret;
8636        }
8637
8638        add_new_free_space(cache, root->fs_info, chunk_offset,
8639                           chunk_offset + size);
8640
8641        free_excluded_extents(root, cache);
8642
8643        ret = btrfs_add_block_group_cache(root->fs_info, cache);
8644        if (ret) {
8645                btrfs_remove_free_space_cache(cache);
8646                btrfs_put_block_group(cache);
8647                return ret;
8648        }
8649
8650        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8651                                &cache->space_info);
8652        if (ret) {
8653                btrfs_remove_free_space_cache(cache);
8654                spin_lock(&root->fs_info->block_group_cache_lock);
8655                rb_erase(&cache->cache_node,
8656                         &root->fs_info->block_group_cache_tree);
8657                spin_unlock(&root->fs_info->block_group_cache_lock);
8658                btrfs_put_block_group(cache);
8659                return ret;
8660        }
8661        update_global_block_rsv(root->fs_info);
8662
8663        spin_lock(&cache->space_info->lock);
8664        cache->space_info->bytes_readonly += cache->bytes_super;
8665        spin_unlock(&cache->space_info->lock);
8666
8667        __link_block_group(cache->space_info, cache);
8668
8669        list_add_tail(&cache->new_bg_list, &trans->new_bgs);
8670
8671        set_avail_alloc_bits(extent_root->fs_info, type);
8672
8673        return 0;
8674}
8675
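/*
 * Clear the extended profile bits of @flags from the per-type
 * avail_*_alloc_bits masks.
 */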
8676static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8677{
8678        u64 extra_flags = chunk_to_extended(flags) &
8679                                BTRFS_EXTENDED_PROFILE_MASK;
8680
8681        write_seqlock(&fs_info->profiles_lock);
8682        if (flags & BTRFS_BLOCK_GROUP_DATA)
8683                fs_info->avail_data_alloc_bits &= ~extra_flags;
8684        if (flags & BTRFS_BLOCK_GROUP_METADATA)
8685                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8686        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8687                fs_info->avail_system_alloc_bits &= ~extra_flags;
8688        write_sequnlock(&fs_info->profiles_lock);
8689}
8690
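/*
 * Remove a read-only block group that is no longer used: drop its free space
 * cache inode, delete its free space cache item and block group item, and
 * unlink the block group from all in-memory structures.
 */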
8691int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8692                             struct btrfs_root *root, u64 group_start)
8693{
8694        struct btrfs_path *path;
8695        struct btrfs_block_group_cache *block_group;
8696        struct btrfs_free_cluster *cluster;
8697        struct btrfs_root *tree_root = root->fs_info->tree_root;
8698        struct btrfs_key key;
8699        struct inode *inode;
8700        int ret;
8701        int index;
8702        int factor;
8703
8704        root = root->fs_info->extent_root;
8705
8706        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8707        BUG_ON(!block_group);
8708        BUG_ON(!block_group->ro);
8709
8710        /*
8711         * Free the reserved super bytes from this block group before
8712         * removing it.
8713         */
8714        free_excluded_extents(root, block_group);
8715
8716        memcpy(&key, &block_group->key, sizeof(key));
8717        index = get_block_group_index(block_group);
8718        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8719                                  BTRFS_BLOCK_GROUP_RAID1 |
8720                                  BTRFS_BLOCK_GROUP_RAID10))
8721                factor = 2;
8722        else
8723                factor = 1;
8724
8725        /* make sure this block group isn't part of an allocation cluster */
8726        cluster = &root->fs_info->data_alloc_cluster;
8727        spin_lock(&cluster->refill_lock);
8728        btrfs_return_cluster_to_free_space(block_group, cluster);
8729        spin_unlock(&cluster->refill_lock);
8730
8731        /*
8732         * make sure this block group isn't part of a metadata
8733         * allocation cluster
8734         */
8735        cluster = &root->fs_info->meta_alloc_cluster;
8736        spin_lock(&cluster->refill_lock);
8737        btrfs_return_cluster_to_free_space(block_group, cluster);
8738        spin_unlock(&cluster->refill_lock);
8739
8740        path = btrfs_alloc_path();
8741        if (!path) {
8742                ret = -ENOMEM;
8743                goto out;
8744        }
8745
8746        inode = lookup_free_space_inode(tree_root, block_group, path);
8747        if (!IS_ERR(inode)) {
8748                ret = btrfs_orphan_add(trans, inode);
8749                if (ret) {
8750                        btrfs_add_delayed_iput(inode);
8751                        goto out;
8752                }
8753                clear_nlink(inode);
8754                /* One for the block group's ref */
8755                spin_lock(&block_group->lock);
8756                if (block_group->iref) {
8757                        block_group->iref = 0;
8758                        block_group->inode = NULL;
8759                        spin_unlock(&block_group->lock);
8760                        iput(inode);
8761                } else {
8762                        spin_unlock(&block_group->lock);
8763                }
8764                /* One for our lookup ref */
8765                btrfs_add_delayed_iput(inode);
8766        }
8767
8768        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8769        key.offset = block_group->key.objectid;
8770        key.type = 0;
8771
8772        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8773        if (ret < 0)
8774                goto out;
8775        if (ret > 0)
8776                btrfs_release_path(path);
8777        if (ret == 0) {
8778                ret = btrfs_del_item(trans, tree_root, path);
8779                if (ret)
8780                        goto out;
8781                btrfs_release_path(path);
8782        }
8783
8784        spin_lock(&root->fs_info->block_group_cache_lock);
8785        rb_erase(&block_group->cache_node,
8786                 &root->fs_info->block_group_cache_tree);
8787
8788        if (root->fs_info->first_logical_byte == block_group->key.objectid)
8789                root->fs_info->first_logical_byte = (u64)-1;
8790        spin_unlock(&root->fs_info->block_group_cache_lock);
8791
8792        down_write(&block_group->space_info->groups_sem);
8793        /*
8794         * we must use list_del_init so people can check to see if they
8795         * are still on the list after taking the semaphore
8796         */
8797        list_del_init(&block_group->list);
8798        if (list_empty(&block_group->space_info->block_groups[index])) {
8799                kobject_del(&block_group->space_info->block_group_kobjs[index]);
8800                kobject_put(&block_group->space_info->block_group_kobjs[index]);
8801                clear_avail_alloc_bits(root->fs_info, block_group->flags);
8802        }
8803        up_write(&block_group->space_info->groups_sem);
8804
8805        if (block_group->cached == BTRFS_CACHE_STARTED)
8806                wait_block_group_cache_done(block_group);
8807
8808        btrfs_remove_free_space_cache(block_group);
8809
8810        spin_lock(&block_group->space_info->lock);
8811        block_group->space_info->total_bytes -= block_group->key.offset;
8812        block_group->space_info->bytes_readonly -= block_group->key.offset;
8813        block_group->space_info->disk_total -= block_group->key.offset * factor;
8814        spin_unlock(&block_group->space_info->lock);
8815
8816        memcpy(&key, &block_group->key, sizeof(key));
8817
8818        btrfs_clear_space_info_full(root->fs_info);
8819
8820        btrfs_put_block_group(block_group);
8821        btrfs_put_block_group(block_group);
8822
8823        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8824        if (ret > 0)
8825                ret = -EIO;
8826        if (ret < 0)
8827                goto out;
8828
8829        ret = btrfs_del_item(trans, root, path);
8830out:
8831        btrfs_free_path(path);
8832        return ret;
8833}
8834
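/*
 * Create the initial space_info structures for the filesystem: SYSTEM plus
 * either separate METADATA and DATA infos, or a single mixed one if the
 * MIXED_GROUPS incompat feature is set.
 */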
8835int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8836{
8837        struct btrfs_space_info *space_info;
8838        struct btrfs_super_block *disk_super;
8839        u64 features;
8840        u64 flags;
8841        int mixed = 0;
8842        int ret;
8843
8844        disk_super = fs_info->super_copy;
8845        if (!btrfs_super_root(disk_super))
8846                return 1;
8847
8848        features = btrfs_super_incompat_flags(disk_super);
8849        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8850                mixed = 1;
8851
8852        flags = BTRFS_BLOCK_GROUP_SYSTEM;
8853        ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8854        if (ret)
8855                goto out;
8856
8857        if (mixed) {
8858                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8859                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8860        } else {
8861                flags = BTRFS_BLOCK_GROUP_METADATA;
8862                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8863                if (ret)
8864                        goto out;
8865
8866                flags = BTRFS_BLOCK_GROUP_DATA;
8867                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8868        }
8869out:
8870        return ret;
8871}
8872
8873int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8874{
8875        return unpin_extent_range(root, start, end);
8876}
8877
8878int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8879                               u64 num_bytes, u64 *actual_bytes)
8880{
8881        return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8882}
8883
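/*
 * FITRIM implementation: walk every block group that overlaps @range, make
 * sure its free space is cached, and discard the free space within the range.
 * The number of bytes actually trimmed is returned in range->len.
 */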
8884int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8885{
8886        struct btrfs_fs_info *fs_info = root->fs_info;
8887        struct btrfs_block_group_cache *cache = NULL;
8888        u64 group_trimmed;
8889        u64 start;
8890        u64 end;
8891        u64 trimmed = 0;
8892        u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8893        int ret = 0;
8894
8895        /*
8896         * try to trim all FS space; our first block group may start at a non-zero offset.
8897         */
8898        if (range->len == total_bytes)
8899                cache = btrfs_lookup_first_block_group(fs_info, range->start);
8900        else
8901                cache = btrfs_lookup_block_group(fs_info, range->start);
8902
8903        while (cache) {
8904                if (cache->key.objectid >= (range->start + range->len)) {
8905                        btrfs_put_block_group(cache);
8906                        break;
8907                }
8908
8909                start = max(range->start, cache->key.objectid);
8910                end = min(range->start + range->len,
8911                                cache->key.objectid + cache->key.offset);
8912
8913                if (end - start >= range->minlen) {
8914                        if (!block_group_cache_done(cache)) {
8915                                ret = cache_block_group(cache, 0);
8916                                if (ret) {
8917                                        btrfs_put_block_group(cache);
8918                                        break;
8919                                }
8920                                ret = wait_block_group_cache_done(cache);
8921                                if (ret) {
8922                                        btrfs_put_block_group(cache);
8923                                        break;
8924                                }
8925                        }
8926                        ret = btrfs_trim_block_group(cache,
8927                                                     &group_trimmed,
8928                                                     start,
8929                                                     end,
8930                                                     range->minlen);
8931
8932                        trimmed += group_trimmed;
8933                        if (ret) {
8934                                btrfs_put_block_group(cache);
8935                                break;
8936                        }
8937                }
8938
8939                cache = next_block_group(fs_info->tree_root, cache);
8940        }
8941
8942        range->len = trimmed;
8943        return ret;
8944}
8945
8946/*
8947 * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write():
8948 * they are used to prevent some tasks from writing data into the page cache
8949 * via nocow before the subvolume is snapshotted, and to flush the data to
8950 * disk after the snapshot creation.
8951 */
8952void btrfs_end_nocow_write(struct btrfs_root *root)
8953{
8954        percpu_counter_dec(&root->subv_writers->counter);
8955        /*
8956         * Make sure counter is updated before we wake up
8957         * waiters.
8958         */
8959        smp_mb();
8960        if (waitqueue_active(&root->subv_writers->wait))
8961                wake_up(&root->subv_writers->wait);
8962}
8963
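/*
 * Take a subvolume writer reference for a nocow write.  Returns 0 if the
 * subvolume is about to be snapshotted (the write must not proceed as nocow),
 * 1 if the reference was taken and btrfs_end_nocow_write() must be called.
 */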
8964int btrfs_start_nocow_write(struct btrfs_root *root)
8965{
8966        if (unlikely(atomic_read(&root->will_be_snapshoted)))
8967                return 0;
8968
8969        percpu_counter_inc(&root->subv_writers->counter);
8970        /*
8971         * Make sure counter is updated before we check for snapshot creation.
8972         */
8973        smp_mb();
8974        if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8975                btrfs_end_nocow_write(root);
8976                return 0;
8977        }
8978        return 1;
8979}
8980