linux/fs/btrfs/extent-tree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/sched/signal.h>
   8#include <linux/pagemap.h>
   9#include <linux/writeback.h>
  10#include <linux/blkdev.h>
  11#include <linux/sort.h>
  12#include <linux/rcupdate.h>
  13#include <linux/kthread.h>
  14#include <linux/slab.h>
  15#include <linux/ratelimit.h>
  16#include <linux/percpu_counter.h>
  17#include <linux/lockdep.h>
  18#include <linux/crc32c.h>
  19#include "misc.h"
  20#include "tree-log.h"
  21#include "disk-io.h"
  22#include "print-tree.h"
  23#include "volumes.h"
  24#include "raid56.h"
  25#include "locking.h"
  26#include "free-space-cache.h"
  27#include "free-space-tree.h"
  28#include "sysfs.h"
  29#include "qgroup.h"
  30#include "ref-verify.h"
  31#include "space-info.h"
  32#include "block-rsv.h"
  33#include "delalloc-space.h"
  34#include "block-group.h"
  35#include "discard.h"
  36#include "rcu-string.h"
  37#include "zoned.h"
  38#include "dev-replace.h"
  39
  40#undef SCRAMBLE_DELAYED_REFS
  41
  42
  43static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  44                               struct btrfs_delayed_ref_node *node, u64 parent,
  45                               u64 root_objectid, u64 owner_objectid,
  46                               u64 owner_offset, int refs_to_drop,
  47                               struct btrfs_delayed_extent_op *extra_op);
  48static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  49                                    struct extent_buffer *leaf,
  50                                    struct btrfs_extent_item *ei);
  51static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  52                                      u64 parent, u64 root_objectid,
  53                                      u64 flags, u64 owner, u64 offset,
  54                                      struct btrfs_key *ins, int ref_mod);
  55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  56                                     struct btrfs_delayed_ref_node *node,
  57                                     struct btrfs_delayed_extent_op *extent_op);
  58static int find_next_key(struct btrfs_path *path, int level,
  59                         struct btrfs_key *key);
  60
  61static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
  62{
  63        return (cache->flags & bits) == bits;
  64}
  65
  66int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
  67                              u64 start, u64 num_bytes)
  68{
  69        u64 end = start + num_bytes - 1;
  70        set_extent_bits(&fs_info->excluded_extents, start, end,
  71                        EXTENT_UPTODATE);
  72        return 0;
  73}
  74
  75void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
  76{
  77        struct btrfs_fs_info *fs_info = cache->fs_info;
  78        u64 start, end;
  79
  80        start = cache->start;
  81        end = start + cache->length - 1;
  82
  83        clear_extent_bits(&fs_info->excluded_extents, start, end,
  84                          EXTENT_UPTODATE);
  85}
  86
  87/* simple helper to search for an existing data extent at a given offset */
  88int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
  89{
  90        int ret;
  91        struct btrfs_key key;
  92        struct btrfs_path *path;
  93
  94        path = btrfs_alloc_path();
  95        if (!path)
  96                return -ENOMEM;
  97
  98        key.objectid = start;
  99        key.offset = len;
 100        key.type = BTRFS_EXTENT_ITEM_KEY;
 101        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 102        btrfs_free_path(path);
 103        return ret;
 104}
 105
 106/*
 107 * helper function to lookup reference count and flags of a tree block.
 108 *
 109 * the head node for delayed ref is used to store the sum of all the
 110 * reference count modifications queued up in the rbtree. the head
 111 * node may also store the extent flags to set. This way you can check
 112 * to see what the reference count and extent flags would be if all of
 113 * the delayed refs are not processed.
 114 */
 115int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 116                             struct btrfs_fs_info *fs_info, u64 bytenr,
 117                             u64 offset, int metadata, u64 *refs, u64 *flags)
 118{
 119        struct btrfs_delayed_ref_head *head;
 120        struct btrfs_delayed_ref_root *delayed_refs;
 121        struct btrfs_path *path;
 122        struct btrfs_extent_item *ei;
 123        struct extent_buffer *leaf;
 124        struct btrfs_key key;
 125        u32 item_size;
 126        u64 num_refs;
 127        u64 extent_flags;
 128        int ret;
 129
 130        /*
 131         * If we don't have skinny metadata, don't bother doing anything
 132         * different
 133         */
 134        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 135                offset = fs_info->nodesize;
 136                metadata = 0;
 137        }
 138
 139        path = btrfs_alloc_path();
 140        if (!path)
 141                return -ENOMEM;
 142
 143        if (!trans) {
 144                path->skip_locking = 1;
 145                path->search_commit_root = 1;
 146        }
 147
 148search_again:
 149        key.objectid = bytenr;
 150        key.offset = offset;
 151        if (metadata)
 152                key.type = BTRFS_METADATA_ITEM_KEY;
 153        else
 154                key.type = BTRFS_EXTENT_ITEM_KEY;
 155
 156        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 157        if (ret < 0)
 158                goto out_free;
 159
 160        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 161                if (path->slots[0]) {
 162                        path->slots[0]--;
 163                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 164                                              path->slots[0]);
 165                        if (key.objectid == bytenr &&
 166                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 167                            key.offset == fs_info->nodesize)
 168                                ret = 0;
 169                }
 170        }
 171
 172        if (ret == 0) {
 173                leaf = path->nodes[0];
 174                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 175                if (item_size >= sizeof(*ei)) {
 176                        ei = btrfs_item_ptr(leaf, path->slots[0],
 177                                            struct btrfs_extent_item);
 178                        num_refs = btrfs_extent_refs(leaf, ei);
 179                        extent_flags = btrfs_extent_flags(leaf, ei);
 180                } else {
 181                        ret = -EINVAL;
 182                        btrfs_print_v0_err(fs_info);
 183                        if (trans)
 184                                btrfs_abort_transaction(trans, ret);
 185                        else
 186                                btrfs_handle_fs_error(fs_info, ret, NULL);
 187
 188                        goto out_free;
 189                }
 190
 191                BUG_ON(num_refs == 0);
 192        } else {
 193                num_refs = 0;
 194                extent_flags = 0;
 195                ret = 0;
 196        }
 197
 198        if (!trans)
 199                goto out;
 200
 201        delayed_refs = &trans->transaction->delayed_refs;
 202        spin_lock(&delayed_refs->lock);
 203        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 204        if (head) {
 205                if (!mutex_trylock(&head->mutex)) {
 206                        refcount_inc(&head->refs);
 207                        spin_unlock(&delayed_refs->lock);
 208
 209                        btrfs_release_path(path);
 210
 211                        /*
 212                         * Mutex was contended, block until it's released and try
 213                         * again
 214                         */
 215                        mutex_lock(&head->mutex);
 216                        mutex_unlock(&head->mutex);
 217                        btrfs_put_delayed_ref_head(head);
 218                        goto search_again;
 219                }
 220                spin_lock(&head->lock);
 221                if (head->extent_op && head->extent_op->update_flags)
 222                        extent_flags |= head->extent_op->flags_to_set;
 223                else
 224                        BUG_ON(num_refs == 0);
 225
 226                num_refs += head->ref_mod;
 227                spin_unlock(&head->lock);
 228                mutex_unlock(&head->mutex);
 229        }
 230        spin_unlock(&delayed_refs->lock);
 231out:
 232        WARN_ON(num_refs == 0);
 233        if (refs)
 234                *refs = num_refs;
 235        if (flags)
 236                *flags = extent_flags;
 237out_free:
 238        btrfs_free_path(path);
 239        return ret;
 240}
 241
 242/*
 243 * Back reference rules.  Back refs have three main goals:
 244 *
 245 * 1) differentiate between all holders of references to an extent so that
 246 *    when a reference is dropped we can make sure it was a valid reference
 247 *    before freeing the extent.
 248 *
 249 * 2) Provide enough information to quickly find the holders of an extent
 250 *    if we notice a given block is corrupted or bad.
 251 *
 252 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 253 *    maintenance.  This is actually the same as #2, but with a slightly
 254 *    different use case.
 255 *
 256 * There are two kinds of back refs. The implicit back refs is optimized
 257 * for pointers in non-shared tree blocks. For a given pointer in a block,
 258 * back refs of this kind provide information about the block's owner tree
 259 * and the pointer's key. These information allow us to find the block by
 260 * b-tree searching. The full back refs is for pointers in tree blocks not
 261 * referenced by their owner trees. The location of tree block is recorded
 262 * in the back refs. Actually the full back refs is generic, and can be
 263 * used in all cases the implicit back refs is used. The major shortcoming
 264 * of the full back refs is its overhead. Every time a tree block gets
 265 * COWed, we have to update back refs entry for all pointers in it.
 266 *
 267 * For a newly allocated tree block, we use implicit back refs for
 268 * pointers in it. This means most tree related operations only involve
 269 * implicit back refs. For a tree block created in old transaction, the
 270 * only way to drop a reference to it is COW it. So we can detect the
 271 * event that tree block loses its owner tree's reference and do the
 272 * back refs conversion.
 273 *
 274 * When a tree block is COWed through a tree, there are four cases:
 275 *
 276 * The reference count of the block is one and the tree is the block's
 277 * owner tree. Nothing to do in this case.
 278 *
 279 * The reference count of the block is one and the tree is not the
 280 * block's owner tree. In this case, full back refs is used for pointers
 281 * in the block. Remove these full back refs, add implicit back refs for
 282 * every pointers in the new block.
 283 *
 284 * The reference count of the block is greater than one and the tree is
 285 * the block's owner tree. In this case, implicit back refs is used for
 286 * pointers in the block. Add full back refs for every pointers in the
 287 * block, increase lower level extents' reference counts. The original
 288 * implicit back refs are entailed to the new block.
 289 *
 290 * The reference count of the block is greater than one and the tree is
 291 * not the block's owner tree. Add implicit back refs for every pointer in
 292 * the new block, increase lower level extents' reference count.
 293 *
 294 * Back Reference Key composing:
 295 *
 296 * The key objectid corresponds to the first byte in the extent,
 297 * The key type is used to differentiate between types of back refs.
 298 * There are different meanings of the key offset for different types
 299 * of back refs.
 300 *
 301 * File extents can be referenced by:
 302 *
 303 * - multiple snapshots, subvolumes, or different generations in one subvol
 304 * - different files inside a single subvolume
 305 * - different offsets inside a file (bookend extents in file.c)
 306 *
 307 * The extent ref structure for the implicit back refs has fields for:
 308 *
 309 * - Objectid of the subvolume root
 310 * - objectid of the file holding the reference
 311 * - original offset in the file
 312 * - how many bookend extents
 313 *
 314 * The key offset for the implicit back refs is hash of the first
 315 * three fields.
 316 *
 317 * The extent ref structure for the full back refs has field for:
 318 *
 319 * - number of pointers in the tree leaf
 320 *
 321 * The key offset for the implicit back refs is the first byte of
 322 * the tree leaf
 323 *
 324 * When a file extent is allocated, The implicit back refs is used.
 325 * the fields are filled in:
 326 *
 327 *     (root_key.objectid, inode objectid, offset in file, 1)
 328 *
 329 * When a file extent is removed file truncation, we find the
 330 * corresponding implicit back refs and check the following fields:
 331 *
 332 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 333 *
 334 * Btree extents can be referenced by:
 335 *
 336 * - Different subvolumes
 337 *
 338 * Both the implicit back refs and the full back refs for tree blocks
 339 * only consist of key. The key offset for the implicit back refs is
 340 * objectid of block's owner tree. The key offset for the full back refs
 341 * is the first byte of parent block.
 342 *
 343 * When implicit back refs is used, information about the lowest key and
 344 * level of the tree block are required. These information are stored in
 345 * tree block info structure.
 346 */
 347
 348/*
 349 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 350 * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
 351 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 352 */
 353int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 354                                     struct btrfs_extent_inline_ref *iref,
 355                                     enum btrfs_inline_ref_type is_data)
 356{
 357        int type = btrfs_extent_inline_ref_type(eb, iref);
 358        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 359
 360        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 361            type == BTRFS_SHARED_BLOCK_REF_KEY ||
 362            type == BTRFS_SHARED_DATA_REF_KEY ||
 363            type == BTRFS_EXTENT_DATA_REF_KEY) {
 364                if (is_data == BTRFS_REF_TYPE_BLOCK) {
 365                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
 366                                return type;
 367                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 368                                ASSERT(eb->fs_info);
 369                                /*
 370                                 * Every shared one has parent tree block,
 371                                 * which must be aligned to sector size.
 372                                 */
 373                                if (offset &&
 374                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
 375                                        return type;
 376                        }
 377                } else if (is_data == BTRFS_REF_TYPE_DATA) {
 378                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
 379                                return type;
 380                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
 381                                ASSERT(eb->fs_info);
 382                                /*
 383                                 * Every shared one has parent tree block,
 384                                 * which must be aligned to sector size.
 385                                 */
 386                                if (offset &&
 387                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
 388                                        return type;
 389                        }
 390                } else {
 391                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
 392                        return type;
 393                }
 394        }
 395
 396        btrfs_print_leaf((struct extent_buffer *)eb);
 397        btrfs_err(eb->fs_info,
 398                  "eb %llu iref 0x%lx invalid extent inline ref type %d",
 399                  eb->start, (unsigned long)iref, type);
 400        WARN_ON(1);
 401
 402        return BTRFS_REF_TYPE_INVALID;
 403}
 404
 405u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 406{
 407        u32 high_crc = ~(u32)0;
 408        u32 low_crc = ~(u32)0;
 409        __le64 lenum;
 410
 411        lenum = cpu_to_le64(root_objectid);
 412        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 413        lenum = cpu_to_le64(owner);
 414        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 415        lenum = cpu_to_le64(offset);
 416        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 417
 418        return ((u64)high_crc << 31) ^ (u64)low_crc;
 419}
 420
 421static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 422                                     struct btrfs_extent_data_ref *ref)
 423{
 424        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 425                                    btrfs_extent_data_ref_objectid(leaf, ref),
 426                                    btrfs_extent_data_ref_offset(leaf, ref));
 427}
 428
 429static int match_extent_data_ref(struct extent_buffer *leaf,
 430                                 struct btrfs_extent_data_ref *ref,
 431                                 u64 root_objectid, u64 owner, u64 offset)
 432{
 433        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
 434            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
 435            btrfs_extent_data_ref_offset(leaf, ref) != offset)
 436                return 0;
 437        return 1;
 438}
 439
 440static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 441                                           struct btrfs_path *path,
 442                                           u64 bytenr, u64 parent,
 443                                           u64 root_objectid,
 444                                           u64 owner, u64 offset)
 445{
 446        struct btrfs_root *root = trans->fs_info->extent_root;
 447        struct btrfs_key key;
 448        struct btrfs_extent_data_ref *ref;
 449        struct extent_buffer *leaf;
 450        u32 nritems;
 451        int ret;
 452        int recow;
 453        int err = -ENOENT;
 454
 455        key.objectid = bytenr;
 456        if (parent) {
 457                key.type = BTRFS_SHARED_DATA_REF_KEY;
 458                key.offset = parent;
 459        } else {
 460                key.type = BTRFS_EXTENT_DATA_REF_KEY;
 461                key.offset = hash_extent_data_ref(root_objectid,
 462                                                  owner, offset);
 463        }
 464again:
 465        recow = 0;
 466        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 467        if (ret < 0) {
 468                err = ret;
 469                goto fail;
 470        }
 471
 472        if (parent) {
 473                if (!ret)
 474                        return 0;
 475                goto fail;
 476        }
 477
 478        leaf = path->nodes[0];
 479        nritems = btrfs_header_nritems(leaf);
 480        while (1) {
 481                if (path->slots[0] >= nritems) {
 482                        ret = btrfs_next_leaf(root, path);
 483                        if (ret < 0)
 484                                err = ret;
 485                        if (ret)
 486                                goto fail;
 487
 488                        leaf = path->nodes[0];
 489                        nritems = btrfs_header_nritems(leaf);
 490                        recow = 1;
 491                }
 492
 493                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 494                if (key.objectid != bytenr ||
 495                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
 496                        goto fail;
 497
 498                ref = btrfs_item_ptr(leaf, path->slots[0],
 499                                     struct btrfs_extent_data_ref);
 500
 501                if (match_extent_data_ref(leaf, ref, root_objectid,
 502                                          owner, offset)) {
 503                        if (recow) {
 504                                btrfs_release_path(path);
 505                                goto again;
 506                        }
 507                        err = 0;
 508                        break;
 509                }
 510                path->slots[0]++;
 511        }
 512fail:
 513        return err;
 514}
 515
 516static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 517                                           struct btrfs_path *path,
 518                                           u64 bytenr, u64 parent,
 519                                           u64 root_objectid, u64 owner,
 520                                           u64 offset, int refs_to_add)
 521{
 522        struct btrfs_root *root = trans->fs_info->extent_root;
 523        struct btrfs_key key;
 524        struct extent_buffer *leaf;
 525        u32 size;
 526        u32 num_refs;
 527        int ret;
 528
 529        key.objectid = bytenr;
 530        if (parent) {
 531                key.type = BTRFS_SHARED_DATA_REF_KEY;
 532                key.offset = parent;
 533                size = sizeof(struct btrfs_shared_data_ref);
 534        } else {
 535                key.type = BTRFS_EXTENT_DATA_REF_KEY;
 536                key.offset = hash_extent_data_ref(root_objectid,
 537                                                  owner, offset);
 538                size = sizeof(struct btrfs_extent_data_ref);
 539        }
 540
 541        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
 542        if (ret && ret != -EEXIST)
 543                goto fail;
 544
 545        leaf = path->nodes[0];
 546        if (parent) {
 547                struct btrfs_shared_data_ref *ref;
 548                ref = btrfs_item_ptr(leaf, path->slots[0],
 549                                     struct btrfs_shared_data_ref);
 550                if (ret == 0) {
 551                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
 552                } else {
 553                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
 554                        num_refs += refs_to_add;
 555                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 556                }
 557        } else {
 558                struct btrfs_extent_data_ref *ref;
 559                while (ret == -EEXIST) {
 560                        ref = btrfs_item_ptr(leaf, path->slots[0],
 561                                             struct btrfs_extent_data_ref);
 562                        if (match_extent_data_ref(leaf, ref, root_objectid,
 563                                                  owner, offset))
 564                                break;
 565                        btrfs_release_path(path);
 566                        key.offset++;
 567                        ret = btrfs_insert_empty_item(trans, root, path, &key,
 568                                                      size);
 569                        if (ret && ret != -EEXIST)
 570                                goto fail;
 571
 572                        leaf = path->nodes[0];
 573                }
 574                ref = btrfs_item_ptr(leaf, path->slots[0],
 575                                     struct btrfs_extent_data_ref);
 576                if (ret == 0) {
 577                        btrfs_set_extent_data_ref_root(leaf, ref,
 578                                                       root_objectid);
 579                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 580                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 581                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
 582                } else {
 583                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
 584                        num_refs += refs_to_add;
 585                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 586                }
 587        }
 588        btrfs_mark_buffer_dirty(leaf);
 589        ret = 0;
 590fail:
 591        btrfs_release_path(path);
 592        return ret;
 593}
 594
 595static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 596                                           struct btrfs_path *path,
 597                                           int refs_to_drop, int *last_ref)
 598{
 599        struct btrfs_key key;
 600        struct btrfs_extent_data_ref *ref1 = NULL;
 601        struct btrfs_shared_data_ref *ref2 = NULL;
 602        struct extent_buffer *leaf;
 603        u32 num_refs = 0;
 604        int ret = 0;
 605
 606        leaf = path->nodes[0];
 607        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 608
 609        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 610                ref1 = btrfs_item_ptr(leaf, path->slots[0],
 611                                      struct btrfs_extent_data_ref);
 612                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 613        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 614                ref2 = btrfs_item_ptr(leaf, path->slots[0],
 615                                      struct btrfs_shared_data_ref);
 616                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 617        } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
 618                btrfs_print_v0_err(trans->fs_info);
 619                btrfs_abort_transaction(trans, -EINVAL);
 620                return -EINVAL;
 621        } else {
 622                BUG();
 623        }
 624
 625        BUG_ON(num_refs < refs_to_drop);
 626        num_refs -= refs_to_drop;
 627
 628        if (num_refs == 0) {
 629                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
 630                *last_ref = 1;
 631        } else {
 632                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
 633                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
 634                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
 635                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
 636                btrfs_mark_buffer_dirty(leaf);
 637        }
 638        return ret;
 639}
 640
 641static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 642                                          struct btrfs_extent_inline_ref *iref)
 643{
 644        struct btrfs_key key;
 645        struct extent_buffer *leaf;
 646        struct btrfs_extent_data_ref *ref1;
 647        struct btrfs_shared_data_ref *ref2;
 648        u32 num_refs = 0;
 649        int type;
 650
 651        leaf = path->nodes[0];
 652        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 653
 654        BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 655        if (iref) {
 656                /*
 657                 * If type is invalid, we should have bailed out earlier than
 658                 * this call.
 659                 */
 660                type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 661                ASSERT(type != BTRFS_REF_TYPE_INVALID);
 662                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 663                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
 664                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 665                } else {
 666                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
 667                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 668                }
 669        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 670                ref1 = btrfs_item_ptr(leaf, path->slots[0],
 671                                      struct btrfs_extent_data_ref);
 672                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 673        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 674                ref2 = btrfs_item_ptr(leaf, path->slots[0],
 675                                      struct btrfs_shared_data_ref);
 676                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 677        } else {
 678                WARN_ON(1);
 679        }
 680        return num_refs;
 681}
 682
 683static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
 684                                          struct btrfs_path *path,
 685                                          u64 bytenr, u64 parent,
 686                                          u64 root_objectid)
 687{
 688        struct btrfs_root *root = trans->fs_info->extent_root;
 689        struct btrfs_key key;
 690        int ret;
 691
 692        key.objectid = bytenr;
 693        if (parent) {
 694                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 695                key.offset = parent;
 696        } else {
 697                key.type = BTRFS_TREE_BLOCK_REF_KEY;
 698                key.offset = root_objectid;
 699        }
 700
 701        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 702        if (ret > 0)
 703                ret = -ENOENT;
 704        return ret;
 705}
 706
 707static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 708                                          struct btrfs_path *path,
 709                                          u64 bytenr, u64 parent,
 710                                          u64 root_objectid)
 711{
 712        struct btrfs_key key;
 713        int ret;
 714
 715        key.objectid = bytenr;
 716        if (parent) {
 717                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 718                key.offset = parent;
 719        } else {
 720                key.type = BTRFS_TREE_BLOCK_REF_KEY;
 721                key.offset = root_objectid;
 722        }
 723
 724        ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
 725                                      path, &key, 0);
 726        btrfs_release_path(path);
 727        return ret;
 728}
 729
 730static inline int extent_ref_type(u64 parent, u64 owner)
 731{
 732        int type;
 733        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 734                if (parent > 0)
 735                        type = BTRFS_SHARED_BLOCK_REF_KEY;
 736                else
 737                        type = BTRFS_TREE_BLOCK_REF_KEY;
 738        } else {
 739                if (parent > 0)
 740                        type = BTRFS_SHARED_DATA_REF_KEY;
 741                else
 742                        type = BTRFS_EXTENT_DATA_REF_KEY;
 743        }
 744        return type;
 745}
 746
 747static int find_next_key(struct btrfs_path *path, int level,
 748                         struct btrfs_key *key)
 749
 750{
 751        for (; level < BTRFS_MAX_LEVEL; level++) {
 752                if (!path->nodes[level])
 753                        break;
 754                if (path->slots[level] + 1 >=
 755                    btrfs_header_nritems(path->nodes[level]))
 756                        continue;
 757                if (level == 0)
 758                        btrfs_item_key_to_cpu(path->nodes[level], key,
 759                                              path->slots[level] + 1);
 760                else
 761                        btrfs_node_key_to_cpu(path->nodes[level], key,
 762                                              path->slots[level] + 1);
 763                return 0;
 764        }
 765        return 1;
 766}
 767
 768/*
 769 * look for inline back ref. if back ref is found, *ref_ret is set
 770 * to the address of inline back ref, and 0 is returned.
 771 *
 772 * if back ref isn't found, *ref_ret is set to the address where it
 773 * should be inserted, and -ENOENT is returned.
 774 *
 775 * if insert is true and there are too many inline back refs, the path
 776 * points to the extent item, and -EAGAIN is returned.
 777 *
 778 * NOTE: inline back refs are ordered in the same way that back ref
 779 *       items in the tree are ordered.
 780 */
 781static noinline_for_stack
 782int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 783                                 struct btrfs_path *path,
 784                                 struct btrfs_extent_inline_ref **ref_ret,
 785                                 u64 bytenr, u64 num_bytes,
 786                                 u64 parent, u64 root_objectid,
 787                                 u64 owner, u64 offset, int insert)
 788{
 789        struct btrfs_fs_info *fs_info = trans->fs_info;
 790        struct btrfs_root *root = fs_info->extent_root;
 791        struct btrfs_key key;
 792        struct extent_buffer *leaf;
 793        struct btrfs_extent_item *ei;
 794        struct btrfs_extent_inline_ref *iref;
 795        u64 flags;
 796        u64 item_size;
 797        unsigned long ptr;
 798        unsigned long end;
 799        int extra_size;
 800        int type;
 801        int want;
 802        int ret;
 803        int err = 0;
 804        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 805        int needed;
 806
 807        key.objectid = bytenr;
 808        key.type = BTRFS_EXTENT_ITEM_KEY;
 809        key.offset = num_bytes;
 810
 811        want = extent_ref_type(parent, owner);
 812        if (insert) {
 813                extra_size = btrfs_extent_inline_ref_size(want);
 814                path->search_for_extension = 1;
 815                path->keep_locks = 1;
 816        } else
 817                extra_size = -1;
 818
 819        /*
 820         * Owner is our level, so we can just add one to get the level for the
 821         * block we are interested in.
 822         */
 823        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
 824                key.type = BTRFS_METADATA_ITEM_KEY;
 825                key.offset = owner;
 826        }
 827
 828again:
 829        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 830        if (ret < 0) {
 831                err = ret;
 832                goto out;
 833        }
 834
 835        /*
 836         * We may be a newly converted file system which still has the old fat
 837         * extent entries for metadata, so try and see if we have one of those.
 838         */
 839        if (ret > 0 && skinny_metadata) {
 840                skinny_metadata = false;
 841                if (path->slots[0]) {
 842                        path->slots[0]--;
 843                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 844                                              path->slots[0]);
 845                        if (key.objectid == bytenr &&
 846                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 847                            key.offset == num_bytes)
 848                                ret = 0;
 849                }
 850                if (ret) {
 851                        key.objectid = bytenr;
 852                        key.type = BTRFS_EXTENT_ITEM_KEY;
 853                        key.offset = num_bytes;
 854                        btrfs_release_path(path);
 855                        goto again;
 856                }
 857        }
 858
 859        if (ret && !insert) {
 860                err = -ENOENT;
 861                goto out;
 862        } else if (WARN_ON(ret)) {
 863                err = -EIO;
 864                goto out;
 865        }
 866
 867        leaf = path->nodes[0];
 868        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 869        if (unlikely(item_size < sizeof(*ei))) {
 870                err = -EINVAL;
 871                btrfs_print_v0_err(fs_info);
 872                btrfs_abort_transaction(trans, err);
 873                goto out;
 874        }
 875
 876        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 877        flags = btrfs_extent_flags(leaf, ei);
 878
 879        ptr = (unsigned long)(ei + 1);
 880        end = (unsigned long)ei + item_size;
 881
 882        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
 883                ptr += sizeof(struct btrfs_tree_block_info);
 884                BUG_ON(ptr > end);
 885        }
 886
 887        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
 888                needed = BTRFS_REF_TYPE_DATA;
 889        else
 890                needed = BTRFS_REF_TYPE_BLOCK;
 891
 892        err = -ENOENT;
 893        while (1) {
 894                if (ptr >= end) {
 895                        WARN_ON(ptr > end);
 896                        break;
 897                }
 898                iref = (struct btrfs_extent_inline_ref *)ptr;
 899                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
 900                if (type == BTRFS_REF_TYPE_INVALID) {
 901                        err = -EUCLEAN;
 902                        goto out;
 903                }
 904
 905                if (want < type)
 906                        break;
 907                if (want > type) {
 908                        ptr += btrfs_extent_inline_ref_size(type);
 909                        continue;
 910                }
 911
 912                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 913                        struct btrfs_extent_data_ref *dref;
 914                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 915                        if (match_extent_data_ref(leaf, dref, root_objectid,
 916                                                  owner, offset)) {
 917                                err = 0;
 918                                break;
 919                        }
 920                        if (hash_extent_data_ref_item(leaf, dref) <
 921                            hash_extent_data_ref(root_objectid, owner, offset))
 922                                break;
 923                } else {
 924                        u64 ref_offset;
 925                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
 926                        if (parent > 0) {
 927                                if (parent == ref_offset) {
 928                                        err = 0;
 929                                        break;
 930                                }
 931                                if (ref_offset < parent)
 932                                        break;
 933                        } else {
 934                                if (root_objectid == ref_offset) {
 935                                        err = 0;
 936                                        break;
 937                                }
 938                                if (ref_offset < root_objectid)
 939                                        break;
 940                        }
 941                }
 942                ptr += btrfs_extent_inline_ref_size(type);
 943        }
 944        if (err == -ENOENT && insert) {
 945                if (item_size + extra_size >=
 946                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
 947                        err = -EAGAIN;
 948                        goto out;
 949                }
 950                /*
 951                 * To add new inline back ref, we have to make sure
 952                 * there is no corresponding back ref item.
 953                 * For simplicity, we just do not add new inline back
 954                 * ref if there is any kind of item for this block
 955                 */
 956                if (find_next_key(path, 0, &key) == 0 &&
 957                    key.objectid == bytenr &&
 958                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 959                        err = -EAGAIN;
 960                        goto out;
 961                }
 962        }
 963        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
 964out:
 965        if (insert) {
 966                path->keep_locks = 0;
 967                path->search_for_extension = 0;
 968                btrfs_unlock_up_safe(path, 1);
 969        }
 970        return err;
 971}
 972
 973/*
 974 * helper to add new inline back ref
 975 */
 976static noinline_for_stack
 977void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 978                                 struct btrfs_path *path,
 979                                 struct btrfs_extent_inline_ref *iref,
 980                                 u64 parent, u64 root_objectid,
 981                                 u64 owner, u64 offset, int refs_to_add,
 982                                 struct btrfs_delayed_extent_op *extent_op)
 983{
 984        struct extent_buffer *leaf;
 985        struct btrfs_extent_item *ei;
 986        unsigned long ptr;
 987        unsigned long end;
 988        unsigned long item_offset;
 989        u64 refs;
 990        int size;
 991        int type;
 992
 993        leaf = path->nodes[0];
 994        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 995        item_offset = (unsigned long)iref - (unsigned long)ei;
 996
 997        type = extent_ref_type(parent, owner);
 998        size = btrfs_extent_inline_ref_size(type);
 999
1000        btrfs_extend_item(path, size);
1001
1002        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1003        refs = btrfs_extent_refs(leaf, ei);
1004        refs += refs_to_add;
1005        btrfs_set_extent_refs(leaf, ei, refs);
1006        if (extent_op)
1007                __run_delayed_extent_op(extent_op, leaf, ei);
1008
1009        ptr = (unsigned long)ei + item_offset;
1010        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1011        if (ptr < end - size)
1012                memmove_extent_buffer(leaf, ptr + size, ptr,
1013                                      end - size - ptr);
1014
1015        iref = (struct btrfs_extent_inline_ref *)ptr;
1016        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1017        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1018                struct btrfs_extent_data_ref *dref;
1019                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1020                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1021                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1022                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1023                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1024        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1025                struct btrfs_shared_data_ref *sref;
1026                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1027                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1028                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1029        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1030                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1031        } else {
1032                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1033        }
1034        btrfs_mark_buffer_dirty(leaf);
1035}
1036
1037static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1038                                 struct btrfs_path *path,
1039                                 struct btrfs_extent_inline_ref **ref_ret,
1040                                 u64 bytenr, u64 num_bytes, u64 parent,
1041                                 u64 root_objectid, u64 owner, u64 offset)
1042{
1043        int ret;
1044
1045        ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1046                                           num_bytes, parent, root_objectid,
1047                                           owner, offset, 0);
1048        if (ret != -ENOENT)
1049                return ret;
1050
1051        btrfs_release_path(path);
1052        *ref_ret = NULL;
1053
1054        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1055                ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1056                                            root_objectid);
1057        } else {
1058                ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1059                                             root_objectid, owner, offset);
1060        }
1061        return ret;
1062}
1063
1064/*
1065 * helper to update/remove inline back ref
1066 */
1067static noinline_for_stack
1068void update_inline_extent_backref(struct btrfs_path *path,
1069                                  struct btrfs_extent_inline_ref *iref,
1070                                  int refs_to_mod,
1071                                  struct btrfs_delayed_extent_op *extent_op,
1072                                  int *last_ref)
1073{
1074        struct extent_buffer *leaf = path->nodes[0];
1075        struct btrfs_extent_item *ei;
1076        struct btrfs_extent_data_ref *dref = NULL;
1077        struct btrfs_shared_data_ref *sref = NULL;
1078        unsigned long ptr;
1079        unsigned long end;
1080        u32 item_size;
1081        int size;
1082        int type;
1083        u64 refs;
1084
1085        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086        refs = btrfs_extent_refs(leaf, ei);
1087        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1088        refs += refs_to_mod;
1089        btrfs_set_extent_refs(leaf, ei, refs);
1090        if (extent_op)
1091                __run_delayed_extent_op(extent_op, leaf, ei);
1092
1093        /*
1094         * If type is invalid, we should have bailed out after
1095         * lookup_inline_extent_backref().
1096         */
1097        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1098        ASSERT(type != BTRFS_REF_TYPE_INVALID);
1099
1100        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1101                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1102                refs = btrfs_extent_data_ref_count(leaf, dref);
1103        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1104                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1105                refs = btrfs_shared_data_ref_count(leaf, sref);
1106        } else {
1107                refs = 1;
1108                BUG_ON(refs_to_mod != -1);
1109        }
1110
1111        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1112        refs += refs_to_mod;
1113
1114        if (refs > 0) {
1115                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1116                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1117                else
1118                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1119        } else {
1120                *last_ref = 1;
1121                size =  btrfs_extent_inline_ref_size(type);
1122                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1123                ptr = (unsigned long)iref;
1124                end = (unsigned long)ei + item_size;
1125                if (ptr + size < end)
1126                        memmove_extent_buffer(leaf, ptr, ptr + size,
1127                                              end - ptr - size);
1128                item_size -= size;
1129                btrfs_truncate_item(path, item_size, 1);
1130        }
1131        btrfs_mark_buffer_dirty(leaf);
1132}
1133
1134static noinline_for_stack
1135int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1136                                 struct btrfs_path *path,
1137                                 u64 bytenr, u64 num_bytes, u64 parent,
1138                                 u64 root_objectid, u64 owner,
1139                                 u64 offset, int refs_to_add,
1140                                 struct btrfs_delayed_extent_op *extent_op)
1141{
1142        struct btrfs_extent_inline_ref *iref;
1143        int ret;
1144
1145        ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1146                                           num_bytes, parent, root_objectid,
1147                                           owner, offset, 1);
1148        if (ret == 0) {
1149                /*
1150                 * We're adding refs to a tree block we already own, this
1151                 * should not happen at all.
1152                 */
1153                if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1154                        btrfs_crit(trans->fs_info,
1155"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1156                                   bytenr, num_bytes, root_objectid);
1157                        if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1158                                WARN_ON(1);
1159                                btrfs_crit(trans->fs_info,
1160                        "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1161                                btrfs_print_leaf(path->nodes[0]);
1162                        }
1163                        return -EUCLEAN;
1164                }
1165                update_inline_extent_backref(path, iref, refs_to_add,
1166                                             extent_op, NULL);
1167        } else if (ret == -ENOENT) {
1168                setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1169                                            root_objectid, owner, offset,
1170                                            refs_to_add, extent_op);
1171                ret = 0;
1172        }
1173        return ret;
1174}
1175
1176static int remove_extent_backref(struct btrfs_trans_handle *trans,
1177                                 struct btrfs_path *path,
1178                                 struct btrfs_extent_inline_ref *iref,
1179                                 int refs_to_drop, int is_data, int *last_ref)
1180{
1181        int ret = 0;
1182
1183        BUG_ON(!is_data && refs_to_drop != 1);
1184        if (iref) {
1185                update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1186                                             last_ref);
1187        } else if (is_data) {
1188                ret = remove_extent_data_ref(trans, path, refs_to_drop,
1189                                             last_ref);
1190        } else {
1191                *last_ref = 1;
1192                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1193        }
1194        return ret;
1195}
1196
1197static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1198                               u64 *discarded_bytes)
1199{
1200        int j, ret = 0;
1201        u64 bytes_left, end;
1202        u64 aligned_start = ALIGN(start, 1 << 9);
1203
1204        if (WARN_ON(start != aligned_start)) {
1205                len -= aligned_start - start;
1206                len = round_down(len, 1 << 9);
1207                start = aligned_start;
1208        }
1209
1210        *discarded_bytes = 0;
1211
1212        if (!len)
1213                return 0;
1214
1215        end = start + len;
1216        bytes_left = len;
1217
1218        /* Skip any superblocks on this device. */
1219        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1220                u64 sb_start = btrfs_sb_offset(j);
1221                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1222                u64 size = sb_start - start;
1223
1224                if (!in_range(sb_start, start, bytes_left) &&
1225                    !in_range(sb_end, start, bytes_left) &&
1226                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1227                        continue;
1228
1229                /*
1230                 * Superblock spans beginning of range.  Adjust start and
1231                 * try again.
1232                 */
1233                if (sb_start <= start) {
1234                        start += sb_end - start;
1235                        if (start > end) {
1236                                bytes_left = 0;
1237                                break;
1238                        }
1239                        bytes_left = end - start;
1240                        continue;
1241                }
1242
1243                if (size) {
1244                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1245                                                   GFP_NOFS, 0);
1246                        if (!ret)
1247                                *discarded_bytes += size;
1248                        else if (ret != -EOPNOTSUPP)
1249                                return ret;
1250                }
1251
1252                start = sb_end;
1253                if (start > end) {
1254                        bytes_left = 0;
1255                        break;
1256                }
1257                bytes_left = end - start;
1258        }
1259
1260        if (bytes_left) {
1261                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1262                                           GFP_NOFS, 0);
1263                if (!ret)
1264                        *discarded_bytes += bytes_left;
1265        }
1266        return ret;
1267}
1268
1269static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
1270{
1271        struct btrfs_device *dev = stripe->dev;
1272        struct btrfs_fs_info *fs_info = dev->fs_info;
1273        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1274        u64 phys = stripe->physical;
1275        u64 len = stripe->length;
1276        u64 discarded = 0;
1277        int ret = 0;
1278
1279        /* Zone reset on a zoned filesystem */
1280        if (btrfs_can_zone_reset(dev, phys, len)) {
1281                u64 src_disc;
1282
1283                ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1284                if (ret)
1285                        goto out;
1286
1287                if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1288                    dev != dev_replace->srcdev)
1289                        goto out;
1290
1291                src_disc = discarded;
1292
1293                /* Send to replace target as well */
1294                ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1295                                              &discarded);
1296                discarded += src_disc;
1297        } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
1298                ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1299        } else {
1300                ret = 0;
1301                *bytes = 0;
1302        }
1303
1304out:
1305        *bytes = discarded;
1306        return ret;
1307}
1308
1309int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1310                         u64 num_bytes, u64 *actual_bytes)
1311{
1312        int ret = 0;
1313        u64 discarded_bytes = 0;
1314        u64 end = bytenr + num_bytes;
1315        u64 cur = bytenr;
1316        struct btrfs_io_context *bioc = NULL;
1317
1318        /*
1319         * Avoid races with device replace and make sure our bioc has devices
1320         * associated to its stripes that don't go away while we are discarding.
1321         */
1322        btrfs_bio_counter_inc_blocked(fs_info);
1323        while (cur < end) {
1324                struct btrfs_io_stripe *stripe;
1325                int i;
1326
1327                num_bytes = end - cur;
1328                /* Tell the block device(s) that the sectors can be discarded */
1329                ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1330                                      &num_bytes, &bioc, 0);
1331                /*
1332                 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1333                 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1334                 * thus we can't continue anyway.
1335                 */
1336                if (ret < 0)
1337                        goto out;
1338
1339                stripe = bioc->stripes;
1340                for (i = 0; i < bioc->num_stripes; i++, stripe++) {
1341                        u64 bytes;
1342                        struct btrfs_device *device = stripe->dev;
1343
1344                        if (!device->bdev) {
1345                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1346                                continue;
1347                        }
1348
1349                        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1350                                continue;
1351
1352                        ret = do_discard_extent(stripe, &bytes);
1353                        if (!ret) {
1354                                discarded_bytes += bytes;
1355                        } else if (ret != -EOPNOTSUPP) {
1356                                /*
1357                                 * Logic errors or -ENOMEM, or -EIO, but
1358                                 * unlikely to happen.
1359                                 *
1360                                 * And since there are two loops, explicitly
1361                                 * go to out to avoid confusion.
1362                                 */
1363                                btrfs_put_bioc(bioc);
1364                                goto out;
1365                        }
1366
1367                        /*
1368                         * Just in case we get back EOPNOTSUPP for some reason,
1369                         * just ignore the return value so we don't screw up
1370                         * people calling discard_extent.
1371                         */
1372                        ret = 0;
1373                }
1374                btrfs_put_bioc(bioc);
1375                cur += num_bytes;
1376        }
1377out:
1378        btrfs_bio_counter_dec(fs_info);
1379
1380        if (actual_bytes)
1381                *actual_bytes = discarded_bytes;
1382
1383
1384        if (ret == -EOPNOTSUPP)
1385                ret = 0;
1386        return ret;
1387}
1388
1389/* Can return -ENOMEM */
1390int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1391                         struct btrfs_ref *generic_ref)
1392{
1393        struct btrfs_fs_info *fs_info = trans->fs_info;
1394        int ret;
1395
1396        ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1397               generic_ref->action);
1398        BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1399               generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1400
1401        if (generic_ref->type == BTRFS_REF_METADATA)
1402                ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1403        else
1404                ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1405
1406        btrfs_ref_tree_mod(fs_info, generic_ref);
1407
1408        return ret;
1409}
1410
1411/*
1412 * __btrfs_inc_extent_ref - insert backreference for a given extent
1413 *
1414 * The counterpart is in __btrfs_free_extent(), with examples and more details
1415 * how it works.
1416 *
1417 * @trans:          Handle of transaction
1418 *
1419 * @node:           The delayed ref node used to get the bytenr/length for
1420 *                  extent whose references are incremented.
1421 *
1422 * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1423 *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1424 *                  bytenr of the parent block. Since new extents are always
1425 *                  created with indirect references, this will only be the case
1426 *                  when relocating a shared extent. In that case, root_objectid
1427 *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1428 *                  be 0
1429 *
1430 * @root_objectid:  The id of the root where this modification has originated,
1431 *                  this can be either one of the well-known metadata trees or
1432 *                  the subvolume id which references this extent.
1433 *
1434 * @owner:          For data extents it is the inode number of the owning file.
1435 *                  For metadata extents this parameter holds the level in the
1436 *                  tree of the extent.
1437 *
1438 * @offset:         For metadata extents the offset is ignored and is currently
1439 *                  always passed as 0. For data extents it is the fileoffset
1440 *                  this extent belongs to.
1441 *
1442 * @refs_to_add     Number of references to add
1443 *
1444 * @extent_op       Pointer to a structure, holding information necessary when
1445 *                  updating a tree block's flags
1446 *
1447 */
1448static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1449                                  struct btrfs_delayed_ref_node *node,
1450                                  u64 parent, u64 root_objectid,
1451                                  u64 owner, u64 offset, int refs_to_add,
1452                                  struct btrfs_delayed_extent_op *extent_op)
1453{
1454        struct btrfs_path *path;
1455        struct extent_buffer *leaf;
1456        struct btrfs_extent_item *item;
1457        struct btrfs_key key;
1458        u64 bytenr = node->bytenr;
1459        u64 num_bytes = node->num_bytes;
1460        u64 refs;
1461        int ret;
1462
1463        path = btrfs_alloc_path();
1464        if (!path)
1465                return -ENOMEM;
1466
1467        /* this will setup the path even if it fails to insert the back ref */
1468        ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1469                                           parent, root_objectid, owner,
1470                                           offset, refs_to_add, extent_op);
1471        if ((ret < 0 && ret != -EAGAIN) || !ret)
1472                goto out;
1473
1474        /*
1475         * Ok we had -EAGAIN which means we didn't have space to insert and
1476         * inline extent ref, so just update the reference count and add a
1477         * normal backref.
1478         */
1479        leaf = path->nodes[0];
1480        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1481        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1482        refs = btrfs_extent_refs(leaf, item);
1483        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1484        if (extent_op)
1485                __run_delayed_extent_op(extent_op, leaf, item);
1486
1487        btrfs_mark_buffer_dirty(leaf);
1488        btrfs_release_path(path);
1489
1490        /* now insert the actual backref */
1491        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1492                BUG_ON(refs_to_add != 1);
1493                ret = insert_tree_block_ref(trans, path, bytenr, parent,
1494                                            root_objectid);
1495        } else {
1496                ret = insert_extent_data_ref(trans, path, bytenr, parent,
1497                                             root_objectid, owner, offset,
1498                                             refs_to_add);
1499        }
1500        if (ret)
1501                btrfs_abort_transaction(trans, ret);
1502out:
1503        btrfs_free_path(path);
1504        return ret;
1505}
1506
1507static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1508                                struct btrfs_delayed_ref_node *node,
1509                                struct btrfs_delayed_extent_op *extent_op,
1510                                int insert_reserved)
1511{
1512        int ret = 0;
1513        struct btrfs_delayed_data_ref *ref;
1514        struct btrfs_key ins;
1515        u64 parent = 0;
1516        u64 ref_root = 0;
1517        u64 flags = 0;
1518
1519        ins.objectid = node->bytenr;
1520        ins.offset = node->num_bytes;
1521        ins.type = BTRFS_EXTENT_ITEM_KEY;
1522
1523        ref = btrfs_delayed_node_to_data_ref(node);
1524        trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1525
1526        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1527                parent = ref->parent;
1528        ref_root = ref->root;
1529
1530        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1531                if (extent_op)
1532                        flags |= extent_op->flags_to_set;
1533                ret = alloc_reserved_file_extent(trans, parent, ref_root,
1534                                                 flags, ref->objectid,
1535                                                 ref->offset, &ins,
1536                                                 node->ref_mod);
1537        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1538                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1539                                             ref->objectid, ref->offset,
1540                                             node->ref_mod, extent_op);
1541        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1542                ret = __btrfs_free_extent(trans, node, parent,
1543                                          ref_root, ref->objectid,
1544                                          ref->offset, node->ref_mod,
1545                                          extent_op);
1546        } else {
1547                BUG();
1548        }
1549        return ret;
1550}
1551
1552static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1553                                    struct extent_buffer *leaf,
1554                                    struct btrfs_extent_item *ei)
1555{
1556        u64 flags = btrfs_extent_flags(leaf, ei);
1557        if (extent_op->update_flags) {
1558                flags |= extent_op->flags_to_set;
1559                btrfs_set_extent_flags(leaf, ei, flags);
1560        }
1561
1562        if (extent_op->update_key) {
1563                struct btrfs_tree_block_info *bi;
1564                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1565                bi = (struct btrfs_tree_block_info *)(ei + 1);
1566                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1567        }
1568}
1569
1570static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1571                                 struct btrfs_delayed_ref_head *head,
1572                                 struct btrfs_delayed_extent_op *extent_op)
1573{
1574        struct btrfs_fs_info *fs_info = trans->fs_info;
1575        struct btrfs_key key;
1576        struct btrfs_path *path;
1577        struct btrfs_extent_item *ei;
1578        struct extent_buffer *leaf;
1579        u32 item_size;
1580        int ret;
1581        int err = 0;
1582        int metadata = !extent_op->is_data;
1583
1584        if (TRANS_ABORTED(trans))
1585                return 0;
1586
1587        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1588                metadata = 0;
1589
1590        path = btrfs_alloc_path();
1591        if (!path)
1592                return -ENOMEM;
1593
1594        key.objectid = head->bytenr;
1595
1596        if (metadata) {
1597                key.type = BTRFS_METADATA_ITEM_KEY;
1598                key.offset = extent_op->level;
1599        } else {
1600                key.type = BTRFS_EXTENT_ITEM_KEY;
1601                key.offset = head->num_bytes;
1602        }
1603
1604again:
1605        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
1606        if (ret < 0) {
1607                err = ret;
1608                goto out;
1609        }
1610        if (ret > 0) {
1611                if (metadata) {
1612                        if (path->slots[0] > 0) {
1613                                path->slots[0]--;
1614                                btrfs_item_key_to_cpu(path->nodes[0], &key,
1615                                                      path->slots[0]);
1616                                if (key.objectid == head->bytenr &&
1617                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
1618                                    key.offset == head->num_bytes)
1619                                        ret = 0;
1620                        }
1621                        if (ret > 0) {
1622                                btrfs_release_path(path);
1623                                metadata = 0;
1624
1625                                key.objectid = head->bytenr;
1626                                key.offset = head->num_bytes;
1627                                key.type = BTRFS_EXTENT_ITEM_KEY;
1628                                goto again;
1629                        }
1630                } else {
1631                        err = -EIO;
1632                        goto out;
1633                }
1634        }
1635
1636        leaf = path->nodes[0];
1637        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1638
1639        if (unlikely(item_size < sizeof(*ei))) {
1640                err = -EINVAL;
1641                btrfs_print_v0_err(fs_info);
1642                btrfs_abort_transaction(trans, err);
1643                goto out;
1644        }
1645
1646        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1647        __run_delayed_extent_op(extent_op, leaf, ei);
1648
1649        btrfs_mark_buffer_dirty(leaf);
1650out:
1651        btrfs_free_path(path);
1652        return err;
1653}
1654
1655static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1656                                struct btrfs_delayed_ref_node *node,
1657                                struct btrfs_delayed_extent_op *extent_op,
1658                                int insert_reserved)
1659{
1660        int ret = 0;
1661        struct btrfs_delayed_tree_ref *ref;
1662        u64 parent = 0;
1663        u64 ref_root = 0;
1664
1665        ref = btrfs_delayed_node_to_tree_ref(node);
1666        trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1667
1668        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1669                parent = ref->parent;
1670        ref_root = ref->root;
1671
1672        if (node->ref_mod != 1) {
1673                btrfs_err(trans->fs_info,
1674        "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1675                          node->bytenr, node->ref_mod, node->action, ref_root,
1676                          parent);
1677                return -EIO;
1678        }
1679        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1680                BUG_ON(!extent_op || !extent_op->update_flags);
1681                ret = alloc_reserved_tree_block(trans, node, extent_op);
1682        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1683                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1684                                             ref->level, 0, 1, extent_op);
1685        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1686                ret = __btrfs_free_extent(trans, node, parent, ref_root,
1687                                          ref->level, 0, 1, extent_op);
1688        } else {
1689                BUG();
1690        }
1691        return ret;
1692}
1693
1694/* helper function to actually process a single delayed ref entry */
1695static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1696                               struct btrfs_delayed_ref_node *node,
1697                               struct btrfs_delayed_extent_op *extent_op,
1698                               int insert_reserved)
1699{
1700        int ret = 0;
1701
1702        if (TRANS_ABORTED(trans)) {
1703                if (insert_reserved)
1704                        btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1705                return 0;
1706        }
1707
1708        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1709            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1710                ret = run_delayed_tree_ref(trans, node, extent_op,
1711                                           insert_reserved);
1712        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1713                 node->type == BTRFS_SHARED_DATA_REF_KEY)
1714                ret = run_delayed_data_ref(trans, node, extent_op,
1715                                           insert_reserved);
1716        else
1717                BUG();
1718        if (ret && insert_reserved)
1719                btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1720        return ret;
1721}
1722
1723static inline struct btrfs_delayed_ref_node *
1724select_delayed_ref(struct btrfs_delayed_ref_head *head)
1725{
1726        struct btrfs_delayed_ref_node *ref;
1727
1728        if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1729                return NULL;
1730
1731        /*
1732         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1733         * This is to prevent a ref count from going down to zero, which deletes
1734         * the extent item from the extent tree, when there still are references
1735         * to add, which would fail because they would not find the extent item.
1736         */
1737        if (!list_empty(&head->ref_add_list))
1738                return list_first_entry(&head->ref_add_list,
1739                                struct btrfs_delayed_ref_node, add_list);
1740
1741        ref = rb_entry(rb_first_cached(&head->ref_tree),
1742                       struct btrfs_delayed_ref_node, ref_node);
1743        ASSERT(list_empty(&ref->add_list));
1744        return ref;
1745}
1746
1747static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1748                                      struct btrfs_delayed_ref_head *head)
1749{
1750        spin_lock(&delayed_refs->lock);
1751        head->processing = 0;
1752        delayed_refs->num_heads_ready++;
1753        spin_unlock(&delayed_refs->lock);
1754        btrfs_delayed_ref_unlock(head);
1755}
1756
1757static struct btrfs_delayed_extent_op *cleanup_extent_op(
1758                                struct btrfs_delayed_ref_head *head)
1759{
1760        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1761
1762        if (!extent_op)
1763                return NULL;
1764
1765        if (head->must_insert_reserved) {
1766                head->extent_op = NULL;
1767                btrfs_free_delayed_extent_op(extent_op);
1768                return NULL;
1769        }
1770        return extent_op;
1771}
1772
1773static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1774                                     struct btrfs_delayed_ref_head *head)
1775{
1776        struct btrfs_delayed_extent_op *extent_op;
1777        int ret;
1778
1779        extent_op = cleanup_extent_op(head);
1780        if (!extent_op)
1781                return 0;
1782        head->extent_op = NULL;
1783        spin_unlock(&head->lock);
1784        ret = run_delayed_extent_op(trans, head, extent_op);
1785        btrfs_free_delayed_extent_op(extent_op);
1786        return ret ? ret : 1;
1787}
1788
1789void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1790                                  struct btrfs_delayed_ref_root *delayed_refs,
1791                                  struct btrfs_delayed_ref_head *head)
1792{
1793        int nr_items = 1;       /* Dropping this ref head update. */
1794
1795        /*
1796         * We had csum deletions accounted for in our delayed refs rsv, we need
1797         * to drop the csum leaves for this update from our delayed_refs_rsv.
1798         */
1799        if (head->total_ref_mod < 0 && head->is_data) {
1800                spin_lock(&delayed_refs->lock);
1801                delayed_refs->pending_csums -= head->num_bytes;
1802                spin_unlock(&delayed_refs->lock);
1803                nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1804        }
1805
1806        btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1807}
1808
1809static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1810                            struct btrfs_delayed_ref_head *head)
1811{
1812
1813        struct btrfs_fs_info *fs_info = trans->fs_info;
1814        struct btrfs_delayed_ref_root *delayed_refs;
1815        int ret;
1816
1817        delayed_refs = &trans->transaction->delayed_refs;
1818
1819        ret = run_and_cleanup_extent_op(trans, head);
1820        if (ret < 0) {
1821                unselect_delayed_ref_head(delayed_refs, head);
1822                btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1823                return ret;
1824        } else if (ret) {
1825                return ret;
1826        }
1827
1828        /*
1829         * Need to drop our head ref lock and re-acquire the delayed ref lock
1830         * and then re-check to make sure nobody got added.
1831         */
1832        spin_unlock(&head->lock);
1833        spin_lock(&delayed_refs->lock);
1834        spin_lock(&head->lock);
1835        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1836                spin_unlock(&head->lock);
1837                spin_unlock(&delayed_refs->lock);
1838                return 1;
1839        }
1840        btrfs_delete_ref_head(delayed_refs, head);
1841        spin_unlock(&head->lock);
1842        spin_unlock(&delayed_refs->lock);
1843
1844        if (head->must_insert_reserved) {
1845                btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1846                if (head->is_data) {
1847                        ret = btrfs_del_csums(trans, fs_info->csum_root,
1848                                              head->bytenr, head->num_bytes);
1849                }
1850        }
1851
1852        btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1853
1854        trace_run_delayed_ref_head(fs_info, head, 0);
1855        btrfs_delayed_ref_unlock(head);
1856        btrfs_put_delayed_ref_head(head);
1857        return ret;
1858}
1859
1860static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1861                                        struct btrfs_trans_handle *trans)
1862{
1863        struct btrfs_delayed_ref_root *delayed_refs =
1864                &trans->transaction->delayed_refs;
1865        struct btrfs_delayed_ref_head *head = NULL;
1866        int ret;
1867
1868        spin_lock(&delayed_refs->lock);
1869        head = btrfs_select_ref_head(delayed_refs);
1870        if (!head) {
1871                spin_unlock(&delayed_refs->lock);
1872                return head;
1873        }
1874
1875        /*
1876         * Grab the lock that says we are going to process all the refs for
1877         * this head
1878         */
1879        ret = btrfs_delayed_ref_lock(delayed_refs, head);
1880        spin_unlock(&delayed_refs->lock);
1881
1882        /*
1883         * We may have dropped the spin lock to get the head mutex lock, and
1884         * that might have given someone else time to free the head.  If that's
1885         * true, it has been removed from our list and we can move on.
1886         */
1887        if (ret == -EAGAIN)
1888                head = ERR_PTR(-EAGAIN);
1889
1890        return head;
1891}
1892
1893static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1894                                    struct btrfs_delayed_ref_head *locked_ref,
1895                                    unsigned long *run_refs)
1896{
1897        struct btrfs_fs_info *fs_info = trans->fs_info;
1898        struct btrfs_delayed_ref_root *delayed_refs;
1899        struct btrfs_delayed_extent_op *extent_op;
1900        struct btrfs_delayed_ref_node *ref;
1901        int must_insert_reserved = 0;
1902        int ret;
1903
1904        delayed_refs = &trans->transaction->delayed_refs;
1905
1906        lockdep_assert_held(&locked_ref->mutex);
1907        lockdep_assert_held(&locked_ref->lock);
1908
1909        while ((ref = select_delayed_ref(locked_ref))) {
1910                if (ref->seq &&
1911                    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1912                        spin_unlock(&locked_ref->lock);
1913                        unselect_delayed_ref_head(delayed_refs, locked_ref);
1914                        return -EAGAIN;
1915                }
1916
1917                (*run_refs)++;
1918                ref->in_tree = 0;
1919                rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1920                RB_CLEAR_NODE(&ref->ref_node);
1921                if (!list_empty(&ref->add_list))
1922                        list_del(&ref->add_list);
1923                /*
1924                 * When we play the delayed ref, also correct the ref_mod on
1925                 * head
1926                 */
1927                switch (ref->action) {
1928                case BTRFS_ADD_DELAYED_REF:
1929                case BTRFS_ADD_DELAYED_EXTENT:
1930                        locked_ref->ref_mod -= ref->ref_mod;
1931                        break;
1932                case BTRFS_DROP_DELAYED_REF:
1933                        locked_ref->ref_mod += ref->ref_mod;
1934                        break;
1935                default:
1936                        WARN_ON(1);
1937                }
1938                atomic_dec(&delayed_refs->num_entries);
1939
1940                /*
1941                 * Record the must_insert_reserved flag before we drop the
1942                 * spin lock.
1943                 */
1944                must_insert_reserved = locked_ref->must_insert_reserved;
1945                locked_ref->must_insert_reserved = 0;
1946
1947                extent_op = locked_ref->extent_op;
1948                locked_ref->extent_op = NULL;
1949                spin_unlock(&locked_ref->lock);
1950
1951                ret = run_one_delayed_ref(trans, ref, extent_op,
1952                                          must_insert_reserved);
1953
1954                btrfs_free_delayed_extent_op(extent_op);
1955                if (ret) {
1956                        unselect_delayed_ref_head(delayed_refs, locked_ref);
1957                        btrfs_put_delayed_ref(ref);
1958                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
1959                                    ret);
1960                        return ret;
1961                }
1962
1963                btrfs_put_delayed_ref(ref);
1964                cond_resched();
1965
1966                spin_lock(&locked_ref->lock);
1967                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1968        }
1969
1970        return 0;
1971}
1972
1973/*
1974 * Returns 0 on success or if called with an already aborted transaction.
1975 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1976 */
1977static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1978                                             unsigned long nr)
1979{
1980        struct btrfs_fs_info *fs_info = trans->fs_info;
1981        struct btrfs_delayed_ref_root *delayed_refs;
1982        struct btrfs_delayed_ref_head *locked_ref = NULL;
1983        ktime_t start = ktime_get();
1984        int ret;
1985        unsigned long count = 0;
1986        unsigned long actual_count = 0;
1987
1988        delayed_refs = &trans->transaction->delayed_refs;
1989        do {
1990                if (!locked_ref) {
1991                        locked_ref = btrfs_obtain_ref_head(trans);
1992                        if (IS_ERR_OR_NULL(locked_ref)) {
1993                                if (PTR_ERR(locked_ref) == -EAGAIN) {
1994                                        continue;
1995                                } else {
1996                                        break;
1997                                }
1998                        }
1999                        count++;
2000                }
2001                /*
2002                 * We need to try and merge add/drops of the same ref since we
2003                 * can run into issues with relocate dropping the implicit ref
2004                 * and then it being added back again before the drop can
2005                 * finish.  If we merged anything we need to re-loop so we can
2006                 * get a good ref.
2007                 * Or we can get node references of the same type that weren't
2008                 * merged when created due to bumps in the tree mod seq, and
2009                 * we need to merge them to prevent adding an inline extent
2010                 * backref before dropping it (triggering a BUG_ON at
2011                 * insert_inline_extent_backref()).
2012                 */
2013                spin_lock(&locked_ref->lock);
2014                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2015
2016                ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2017                                                      &actual_count);
2018                if (ret < 0 && ret != -EAGAIN) {
2019                        /*
2020                         * Error, btrfs_run_delayed_refs_for_head already
2021                         * unlocked everything so just bail out
2022                         */
2023                        return ret;
2024                } else if (!ret) {
2025                        /*
2026                         * Success, perform the usual cleanup of a processed
2027                         * head
2028                         */
2029                        ret = cleanup_ref_head(trans, locked_ref);
2030                        if (ret > 0 ) {
2031                                /* We dropped our lock, we need to loop. */
2032                                ret = 0;
2033                                continue;
2034                        } else if (ret) {
2035                                return ret;
2036                        }
2037                }
2038
2039                /*
2040                 * Either success case or btrfs_run_delayed_refs_for_head
2041                 * returned -EAGAIN, meaning we need to select another head
2042                 */
2043
2044                locked_ref = NULL;
2045                cond_resched();
2046        } while ((nr != -1 && count < nr) || locked_ref);
2047
2048        /*
2049         * We don't want to include ref heads since we can have empty ref heads
2050         * and those will drastically skew our runtime down since we just do
2051         * accounting, no actual extent tree updates.
2052         */
2053        if (actual_count > 0) {
2054                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2055                u64 avg;
2056
2057                /*
2058                 * We weigh the current average higher than our current runtime
2059                 * to avoid large swings in the average.
2060                 */
2061                spin_lock(&delayed_refs->lock);
2062                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2063                fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2064                spin_unlock(&delayed_refs->lock);
2065        }
2066        return 0;
2067}
2068
2069#ifdef SCRAMBLE_DELAYED_REFS
2070/*
2071 * Normally delayed refs get processed in ascending bytenr order. This
2072 * correlates in most cases to the order added. To expose dependencies on this
2073 * order, we start to process the tree in the middle instead of the beginning
2074 */
2075static u64 find_middle(struct rb_root *root)
2076{
2077        struct rb_node *n = root->rb_node;
2078        struct btrfs_delayed_ref_node *entry;
2079        int alt = 1;
2080        u64 middle;
2081        u64 first = 0, last = 0;
2082
2083        n = rb_first(root);
2084        if (n) {
2085                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2086                first = entry->bytenr;
2087        }
2088        n = rb_last(root);
2089        if (n) {
2090                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2091                last = entry->bytenr;
2092        }
2093        n = root->rb_node;
2094
2095        while (n) {
2096                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2097                WARN_ON(!entry->in_tree);
2098
2099                middle = entry->bytenr;
2100
2101                if (alt)
2102                        n = n->rb_left;
2103                else
2104                        n = n->rb_right;
2105
2106                alt = 1 - alt;
2107        }
2108        return middle;
2109}
2110#endif
2111
2112/*
2113 * this starts processing the delayed reference count updates and
2114 * extent insertions we have queued up so far.  count can be
2115 * 0, which means to process everything in the tree at the start
2116 * of the run (but not newly added entries), or it can be some target
2117 * number you'd like to process.
2118 *
2119 * Returns 0 on success or if called with an aborted transaction
2120 * Returns <0 on error and aborts the transaction
2121 */
2122int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2123                           unsigned long count)
2124{
2125        struct btrfs_fs_info *fs_info = trans->fs_info;
2126        struct rb_node *node;
2127        struct btrfs_delayed_ref_root *delayed_refs;
2128        struct btrfs_delayed_ref_head *head;
2129        int ret;
2130        int run_all = count == (unsigned long)-1;
2131
2132        /* We'll clean this up in btrfs_cleanup_transaction */
2133        if (TRANS_ABORTED(trans))
2134                return 0;
2135
2136        if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2137                return 0;
2138
2139        delayed_refs = &trans->transaction->delayed_refs;
2140        if (count == 0)
2141                count = delayed_refs->num_heads_ready;
2142
2143again:
2144#ifdef SCRAMBLE_DELAYED_REFS
2145        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2146#endif
2147        ret = __btrfs_run_delayed_refs(trans, count);
2148        if (ret < 0) {
2149                btrfs_abort_transaction(trans, ret);
2150                return ret;
2151        }
2152
2153        if (run_all) {
2154                btrfs_create_pending_block_groups(trans);
2155
2156                spin_lock(&delayed_refs->lock);
2157                node = rb_first_cached(&delayed_refs->href_root);
2158                if (!node) {
2159                        spin_unlock(&delayed_refs->lock);
2160                        goto out;
2161                }
2162                head = rb_entry(node, struct btrfs_delayed_ref_head,
2163                                href_node);
2164                refcount_inc(&head->refs);
2165                spin_unlock(&delayed_refs->lock);
2166
2167                /* Mutex was contended, block until it's released and retry. */
2168                mutex_lock(&head->mutex);
2169                mutex_unlock(&head->mutex);
2170
2171                btrfs_put_delayed_ref_head(head);
2172                cond_resched();
2173                goto again;
2174        }
2175out:
2176        return 0;
2177}
2178
2179int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2180                                struct extent_buffer *eb, u64 flags,
2181                                int level, int is_data)
2182{
2183        struct btrfs_delayed_extent_op *extent_op;
2184        int ret;
2185
2186        extent_op = btrfs_alloc_delayed_extent_op();
2187        if (!extent_op)
2188                return -ENOMEM;
2189
2190        extent_op->flags_to_set = flags;
2191        extent_op->update_flags = true;
2192        extent_op->update_key = false;
2193        extent_op->is_data = is_data ? true : false;
2194        extent_op->level = level;
2195
2196        ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2197        if (ret)
2198                btrfs_free_delayed_extent_op(extent_op);
2199        return ret;
2200}
2201
2202static noinline int check_delayed_ref(struct btrfs_root *root,
2203                                      struct btrfs_path *path,
2204                                      u64 objectid, u64 offset, u64 bytenr)
2205{
2206        struct btrfs_delayed_ref_head *head;
2207        struct btrfs_delayed_ref_node *ref;
2208        struct btrfs_delayed_data_ref *data_ref;
2209        struct btrfs_delayed_ref_root *delayed_refs;
2210        struct btrfs_transaction *cur_trans;
2211        struct rb_node *node;
2212        int ret = 0;
2213
2214        spin_lock(&root->fs_info->trans_lock);
2215        cur_trans = root->fs_info->running_transaction;
2216        if (cur_trans)
2217                refcount_inc(&cur_trans->use_count);
2218        spin_unlock(&root->fs_info->trans_lock);
2219        if (!cur_trans)
2220                return 0;
2221
2222        delayed_refs = &cur_trans->delayed_refs;
2223        spin_lock(&delayed_refs->lock);
2224        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2225        if (!head) {
2226                spin_unlock(&delayed_refs->lock);
2227                btrfs_put_transaction(cur_trans);
2228                return 0;
2229        }
2230
2231        if (!mutex_trylock(&head->mutex)) {
2232                refcount_inc(&head->refs);
2233                spin_unlock(&delayed_refs->lock);
2234
2235                btrfs_release_path(path);
2236
2237                /*
2238                 * Mutex was contended, block until it's released and let
2239                 * caller try again
2240                 */
2241                mutex_lock(&head->mutex);
2242                mutex_unlock(&head->mutex);
2243                btrfs_put_delayed_ref_head(head);
2244                btrfs_put_transaction(cur_trans);
2245                return -EAGAIN;
2246        }
2247        spin_unlock(&delayed_refs->lock);
2248
2249        spin_lock(&head->lock);
2250        /*
2251         * XXX: We should replace this with a proper search function in the
2252         * future.
2253         */
2254        for (node = rb_first_cached(&head->ref_tree); node;
2255             node = rb_next(node)) {
2256                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2257                /* If it's a shared ref we know a cross reference exists */
2258                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2259                        ret = 1;
2260                        break;
2261                }
2262
2263                data_ref = btrfs_delayed_node_to_data_ref(ref);
2264
2265                /*
2266                 * If our ref doesn't match the one we're currently looking at
2267                 * then we have a cross reference.
2268                 */
2269                if (data_ref->root != root->root_key.objectid ||
2270                    data_ref->objectid != objectid ||
2271                    data_ref->offset != offset) {
2272                        ret = 1;
2273                        break;
2274                }
2275        }
2276        spin_unlock(&head->lock);
2277        mutex_unlock(&head->mutex);
2278        btrfs_put_transaction(cur_trans);
2279        return ret;
2280}
2281
2282static noinline int check_committed_ref(struct btrfs_root *root,
2283                                        struct btrfs_path *path,
2284                                        u64 objectid, u64 offset, u64 bytenr,
2285                                        bool strict)
2286{
2287        struct btrfs_fs_info *fs_info = root->fs_info;
2288        struct btrfs_root *extent_root = fs_info->extent_root;
2289        struct extent_buffer *leaf;
2290        struct btrfs_extent_data_ref *ref;
2291        struct btrfs_extent_inline_ref *iref;
2292        struct btrfs_extent_item *ei;
2293        struct btrfs_key key;
2294        u32 item_size;
2295        int type;
2296        int ret;
2297
2298        key.objectid = bytenr;
2299        key.offset = (u64)-1;
2300        key.type = BTRFS_EXTENT_ITEM_KEY;
2301
2302        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2303        if (ret < 0)
2304                goto out;
2305        BUG_ON(ret == 0); /* Corruption */
2306
2307        ret = -ENOENT;
2308        if (path->slots[0] == 0)
2309                goto out;
2310
2311        path->slots[0]--;
2312        leaf = path->nodes[0];
2313        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2314
2315        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2316                goto out;
2317
2318        ret = 1;
2319        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2320        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2321
2322        /* If extent item has more than 1 inline ref then it's shared */
2323        if (item_size != sizeof(*ei) +
2324            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2325                goto out;
2326
2327        /*
2328         * If extent created before last snapshot => it's shared unless the
2329         * snapshot has been deleted. Use the heuristic if strict is false.
2330         */
2331        if (!strict &&
2332            (btrfs_extent_generation(leaf, ei) <=
2333             btrfs_root_last_snapshot(&root->root_item)))
2334                goto out;
2335
2336        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2337
2338        /* If this extent has SHARED_DATA_REF then it's shared */
2339        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2340        if (type != BTRFS_EXTENT_DATA_REF_KEY)
2341                goto out;
2342
2343        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2344        if (btrfs_extent_refs(leaf, ei) !=
2345            btrfs_extent_data_ref_count(leaf, ref) ||
2346            btrfs_extent_data_ref_root(leaf, ref) !=
2347            root->root_key.objectid ||
2348            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2349            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2350                goto out;
2351
2352        ret = 0;
2353out:
2354        return ret;
2355}
2356
2357int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2358                          u64 bytenr, bool strict)
2359{
2360        struct btrfs_path *path;
2361        int ret;
2362
2363        path = btrfs_alloc_path();
2364        if (!path)
2365                return -ENOMEM;
2366
2367        do {
2368                ret = check_committed_ref(root, path, objectid,
2369                                          offset, bytenr, strict);
2370                if (ret && ret != -ENOENT)
2371                        goto out;
2372
2373                ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2374        } while (ret == -EAGAIN);
2375
2376out:
2377        btrfs_free_path(path);
2378        if (btrfs_is_data_reloc_root(root))
2379                WARN_ON(ret > 0);
2380        return ret;
2381}
2382
2383static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2384                           struct btrfs_root *root,
2385                           struct extent_buffer *buf,
2386                           int full_backref, int inc)
2387{
2388        struct btrfs_fs_info *fs_info = root->fs_info;
2389        u64 bytenr;
2390        u64 num_bytes;
2391        u64 parent;
2392        u64 ref_root;
2393        u32 nritems;
2394        struct btrfs_key key;
2395        struct btrfs_file_extent_item *fi;
2396        struct btrfs_ref generic_ref = { 0 };
2397        bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2398        int i;
2399        int action;
2400        int level;
2401        int ret = 0;
2402
2403        if (btrfs_is_testing(fs_info))
2404                return 0;
2405
2406        ref_root = btrfs_header_owner(buf);
2407        nritems = btrfs_header_nritems(buf);
2408        level = btrfs_header_level(buf);
2409
2410        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2411                return 0;
2412
2413        if (full_backref)
2414                parent = buf->start;
2415        else
2416                parent = 0;
2417        if (inc)
2418                action = BTRFS_ADD_DELAYED_REF;
2419        else
2420                action = BTRFS_DROP_DELAYED_REF;
2421
2422        for (i = 0; i < nritems; i++) {
2423                if (level == 0) {
2424                        btrfs_item_key_to_cpu(buf, &key, i);
2425                        if (key.type != BTRFS_EXTENT_DATA_KEY)
2426                                continue;
2427                        fi = btrfs_item_ptr(buf, i,
2428                                            struct btrfs_file_extent_item);
2429                        if (btrfs_file_extent_type(buf, fi) ==
2430                            BTRFS_FILE_EXTENT_INLINE)
2431                                continue;
2432                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2433                        if (bytenr == 0)
2434                                continue;
2435
2436                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2437                        key.offset -= btrfs_file_extent_offset(buf, fi);
2438                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
2439                                               num_bytes, parent);
2440                        btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2441                                            key.offset, root->root_key.objectid,
2442                                            for_reloc);
2443                        if (inc)
2444                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
2445                        else
2446                                ret = btrfs_free_extent(trans, &generic_ref);
2447                        if (ret)
2448                                goto fail;
2449                } else {
2450                        bytenr = btrfs_node_blockptr(buf, i);
2451                        num_bytes = fs_info->nodesize;
2452                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
2453                                               num_bytes, parent);
2454                        btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2455                                            root->root_key.objectid, for_reloc);
2456                        if (inc)
2457                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
2458                        else
2459                                ret = btrfs_free_extent(trans, &generic_ref);
2460                        if (ret)
2461                                goto fail;
2462                }
2463        }
2464        return 0;
2465fail:
2466        return ret;
2467}
2468
2469int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2470                  struct extent_buffer *buf, int full_backref)
2471{
2472        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2473}
2474
2475int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2476                  struct extent_buffer *buf, int full_backref)
2477{
2478        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2479}
2480
2481static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2482{
2483        struct btrfs_fs_info *fs_info = root->fs_info;
2484        u64 flags;
2485        u64 ret;
2486
2487        if (data)
2488                flags = BTRFS_BLOCK_GROUP_DATA;
2489        else if (root == fs_info->chunk_root)
2490                flags = BTRFS_BLOCK_GROUP_SYSTEM;
2491        else
2492                flags = BTRFS_BLOCK_GROUP_METADATA;
2493
2494        ret = btrfs_get_alloc_profile(fs_info, flags);
2495        return ret;
2496}
2497
2498static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2499{
2500        struct btrfs_block_group *cache;
2501        u64 bytenr;
2502
2503        spin_lock(&fs_info->block_group_cache_lock);
2504        bytenr = fs_info->first_logical_byte;
2505        spin_unlock(&fs_info->block_group_cache_lock);
2506
2507        if (bytenr < (u64)-1)
2508                return bytenr;
2509
2510        cache = btrfs_lookup_first_block_group(fs_info, search_start);
2511        if (!cache)
2512                return 0;
2513
2514        bytenr = cache->start;
2515        btrfs_put_block_group(cache);
2516
2517        return bytenr;
2518}
2519
2520static int pin_down_extent(struct btrfs_trans_handle *trans,
2521                           struct btrfs_block_group *cache,
2522                           u64 bytenr, u64 num_bytes, int reserved)
2523{
2524        struct btrfs_fs_info *fs_info = cache->fs_info;
2525
2526        spin_lock(&cache->space_info->lock);
2527        spin_lock(&cache->lock);
2528        cache->pinned += num_bytes;
2529        btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2530                                             num_bytes);
2531        if (reserved) {
2532                cache->reserved -= num_bytes;
2533                cache->space_info->bytes_reserved -= num_bytes;
2534        }
2535        spin_unlock(&cache->lock);
2536        spin_unlock(&cache->space_info->lock);
2537
2538        set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2539                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2540        return 0;
2541}
2542
2543int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2544                     u64 bytenr, u64 num_bytes, int reserved)
2545{
2546        struct btrfs_block_group *cache;
2547
2548        cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2549        BUG_ON(!cache); /* Logic error */
2550
2551        pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2552
2553        btrfs_put_block_group(cache);
2554        return 0;
2555}
2556
2557/*
2558 * this function must be called within transaction
2559 */
2560int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2561                                    u64 bytenr, u64 num_bytes)
2562{
2563        struct btrfs_block_group *cache;
2564        int ret;
2565
2566        cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2567        if (!cache)
2568                return -EINVAL;
2569
2570        /*
2571         * pull in the free space cache (if any) so that our pin
2572         * removes the free space from the cache.  We have load_only set
2573         * to one because the slow code to read in the free extents does check
2574         * the pinned extents.
2575         */
2576        btrfs_cache_block_group(cache, 1);
2577        /*
2578         * Make sure we wait until the cache is completely built in case it is
2579         * missing or is invalid and therefore needs to be rebuilt.
2580         */
2581        ret = btrfs_wait_block_group_cache_done(cache);
2582        if (ret)
2583                goto out;
2584
2585        pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2586
2587        /* remove us from the free space cache (if we're there at all) */
2588        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2589out:
2590        btrfs_put_block_group(cache);
2591        return ret;
2592}
2593
2594static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2595                                   u64 start, u64 num_bytes)
2596{
2597        int ret;
2598        struct btrfs_block_group *block_group;
2599
2600        block_group = btrfs_lookup_block_group(fs_info, start);
2601        if (!block_group)
2602                return -EINVAL;
2603
2604        btrfs_cache_block_group(block_group, 1);
2605        /*
2606         * Make sure we wait until the cache is completely built in case it is
2607         * missing or is invalid and therefore needs to be rebuilt.
2608         */
2609        ret = btrfs_wait_block_group_cache_done(block_group);
2610        if (ret)
2611                goto out;
2612
2613        ret = btrfs_remove_free_space(block_group, start, num_bytes);
2614out:
2615        btrfs_put_block_group(block_group);
2616        return ret;
2617}
2618
2619int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2620{
2621        struct btrfs_fs_info *fs_info = eb->fs_info;
2622        struct btrfs_file_extent_item *item;
2623        struct btrfs_key key;
2624        int found_type;
2625        int i;
2626        int ret = 0;
2627
2628        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2629                return 0;
2630
2631        for (i = 0; i < btrfs_header_nritems(eb); i++) {
2632                btrfs_item_key_to_cpu(eb, &key, i);
2633                if (key.type != BTRFS_EXTENT_DATA_KEY)
2634                        continue;
2635                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2636                found_type = btrfs_file_extent_type(eb, item);
2637                if (found_type == BTRFS_FILE_EXTENT_INLINE)
2638                        continue;
2639                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2640                        continue;
2641                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2642                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2643                ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2644                if (ret)
2645                        break;
2646        }
2647
2648        return ret;
2649}
2650
2651static void
2652btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2653{
2654        atomic_inc(&bg->reservations);
2655}
2656
2657/*
2658 * Returns the free cluster for the given space info and sets empty_cluster to
2659 * what it should be based on the mount options.
2660 */
2661static struct btrfs_free_cluster *
2662fetch_cluster_info(struct btrfs_fs_info *fs_info,
2663                   struct btrfs_space_info *space_info, u64 *empty_cluster)
2664{
2665        struct btrfs_free_cluster *ret = NULL;
2666
2667        *empty_cluster = 0;
2668        if (btrfs_mixed_space_info(space_info))
2669                return ret;
2670
2671        if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2672                ret = &fs_info->meta_alloc_cluster;
2673                if (btrfs_test_opt(fs_info, SSD))
2674                        *empty_cluster = SZ_2M;
2675                else
2676                        *empty_cluster = SZ_64K;
2677        } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2678                   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2679                *empty_cluster = SZ_2M;
2680                ret = &fs_info->data_alloc_cluster;
2681        }
2682
2683        return ret;
2684}
2685
2686static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2687                              u64 start, u64 end,
2688                              const bool return_free_space)
2689{
2690        struct btrfs_block_group *cache = NULL;
2691        struct btrfs_space_info *space_info;
2692        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2693        struct btrfs_free_cluster *cluster = NULL;
2694        u64 len;
2695        u64 total_unpinned = 0;
2696        u64 empty_cluster = 0;
2697        bool readonly;
2698
2699        while (start <= end) {
2700                readonly = false;
2701                if (!cache ||
2702                    start >= cache->start + cache->length) {
2703                        if (cache)
2704                                btrfs_put_block_group(cache);
2705                        total_unpinned = 0;
2706                        cache = btrfs_lookup_block_group(fs_info, start);
2707                        BUG_ON(!cache); /* Logic error */
2708
2709                        cluster = fetch_cluster_info(fs_info,
2710                                                     cache->space_info,
2711                                                     &empty_cluster);
2712                        empty_cluster <<= 1;
2713                }
2714
2715                len = cache->start + cache->length - start;
2716                len = min(len, end + 1 - start);
2717
2718                down_read(&fs_info->commit_root_sem);
2719                if (start < cache->last_byte_to_unpin && return_free_space) {
2720                        u64 add_len = min(len, cache->last_byte_to_unpin - start);
2721
2722                        btrfs_add_free_space(cache, start, add_len);
2723                }
2724                up_read(&fs_info->commit_root_sem);
2725
2726                start += len;
2727                total_unpinned += len;
2728                space_info = cache->space_info;
2729
2730                /*
2731                 * If this space cluster has been marked as fragmented and we've
2732                 * unpinned enough in this block group to potentially allow a
2733                 * cluster to be created inside of it go ahead and clear the
2734                 * fragmented check.
2735                 */
2736                if (cluster && cluster->fragmented &&
2737                    total_unpinned > empty_cluster) {
2738                        spin_lock(&cluster->lock);
2739                        cluster->fragmented = 0;
2740                        spin_unlock(&cluster->lock);
2741                }
2742
2743                spin_lock(&space_info->lock);
2744                spin_lock(&cache->lock);
2745                cache->pinned -= len;
2746                btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2747                space_info->max_extent_size = 0;
2748                if (cache->ro) {
2749                        space_info->bytes_readonly += len;
2750                        readonly = true;
2751                } else if (btrfs_is_zoned(fs_info)) {
2752                        /* Need reset before reusing in a zoned block group */
2753                        space_info->bytes_zone_unusable += len;
2754                        readonly = true;
2755                }
2756                spin_unlock(&cache->lock);
2757                if (!readonly && return_free_space &&
2758                    global_rsv->space_info == space_info) {
2759                        u64 to_add = len;
2760
2761                        spin_lock(&global_rsv->lock);
2762                        if (!global_rsv->full) {
2763                                to_add = min(len, global_rsv->size -
2764                                             global_rsv->reserved);
2765                                global_rsv->reserved += to_add;
2766                                btrfs_space_info_update_bytes_may_use(fs_info,
2767                                                space_info, to_add);
2768                                if (global_rsv->reserved >= global_rsv->size)
2769                                        global_rsv->full = 1;
2770                                len -= to_add;
2771                        }
2772                        spin_unlock(&global_rsv->lock);
2773                }
2774                /* Add to any tickets we may have */
2775                if (!readonly && return_free_space && len)
2776                        btrfs_try_granting_tickets(fs_info, space_info);
2777                spin_unlock(&space_info->lock);
2778        }
2779
2780        if (cache)
2781                btrfs_put_block_group(cache);
2782        return 0;
2783}
2784
2785int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2786{
2787        struct btrfs_fs_info *fs_info = trans->fs_info;
2788        struct btrfs_block_group *block_group, *tmp;
2789        struct list_head *deleted_bgs;
2790        struct extent_io_tree *unpin;
2791        u64 start;
2792        u64 end;
2793        int ret;
2794
2795        unpin = &trans->transaction->pinned_extents;
2796
2797        while (!TRANS_ABORTED(trans)) {
2798                struct extent_state *cached_state = NULL;
2799
2800                mutex_lock(&fs_info->unused_bg_unpin_mutex);
2801                ret = find_first_extent_bit(unpin, 0, &start, &end,
2802                                            EXTENT_DIRTY, &cached_state);
2803                if (ret) {
2804                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2805                        break;
2806                }
2807
2808                if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2809                        ret = btrfs_discard_extent(fs_info, start,
2810                                                   end + 1 - start, NULL);
2811
2812                clear_extent_dirty(unpin, start, end, &cached_state);
2813                unpin_extent_range(fs_info, start, end, true);
2814                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2815                free_extent_state(cached_state);
2816                cond_resched();
2817        }
2818
2819        if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2820                btrfs_discard_calc_delay(&fs_info->discard_ctl);
2821                btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2822        }
2823
2824        /*
2825         * Transaction is finished.  We don't need the lock anymore.  We
2826         * do need to clean up the block groups in case of a transaction
2827         * abort.
2828         */
2829        deleted_bgs = &trans->transaction->deleted_bgs;
2830        list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2831                u64 trimmed = 0;
2832
2833                ret = -EROFS;
2834                if (!TRANS_ABORTED(trans))
2835                        ret = btrfs_discard_extent(fs_info,
2836                                                   block_group->start,
2837                                                   block_group->length,
2838                                                   &trimmed);
2839
2840                list_del_init(&block_group->bg_list);
2841                btrfs_unfreeze_block_group(block_group);
2842                btrfs_put_block_group(block_group);
2843
2844                if (ret) {
2845                        const char *errstr = btrfs_decode_error(ret);
2846                        btrfs_warn(fs_info,
2847                           "discard failed while removing blockgroup: errno=%d %s",
2848                                   ret, errstr);
2849                }
2850        }
2851
2852        return 0;
2853}
2854
2855/*
2856 * Drop one or more refs of @node.
2857 *
2858 * 1. Locate the extent refs.
2859 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2860 *    Locate it, then reduce the refs number or remove the ref line completely.
2861 *
2862 * 2. Update the refs count in EXTENT/METADATA_ITEM
2863 *
2864 * Inline backref case:
2865 *
2866 * in extent tree we have:
2867 *
2868 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2869 *              refs 2 gen 6 flags DATA
2870 *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2871 *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2872 *
2873 * This function gets called with:
2874 *
2875 *    node->bytenr = 13631488
2876 *    node->num_bytes = 1048576
2877 *    root_objectid = FS_TREE
2878 *    owner_objectid = 257
2879 *    owner_offset = 0
2880 *    refs_to_drop = 1
2881 *
2882 * Then we should get some like:
2883 *
2884 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2885 *              refs 1 gen 6 flags DATA
2886 *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2887 *
2888 * Keyed backref case:
2889 *
2890 * in extent tree we have:
2891 *
2892 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2893 *              refs 754 gen 6 flags DATA
2894 *      [...]
2895 *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2896 *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2897 *
2898 * This function get called with:
2899 *
2900 *    node->bytenr = 13631488
2901 *    node->num_bytes = 1048576
2902 *    root_objectid = FS_TREE
2903 *    owner_objectid = 866
2904 *    owner_offset = 0
2905 *    refs_to_drop = 1
2906 *
2907 * Then we should get some like:
2908 *
2909 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2910 *              refs 753 gen 6 flags DATA
2911 *
2912 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2913 */
2914static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2915                               struct btrfs_delayed_ref_node *node, u64 parent,
2916                               u64 root_objectid, u64 owner_objectid,
2917                               u64 owner_offset, int refs_to_drop,
2918                               struct btrfs_delayed_extent_op *extent_op)
2919{
2920        struct btrfs_fs_info *info = trans->fs_info;
2921        struct btrfs_key key;
2922        struct btrfs_path *path;
2923        struct btrfs_root *extent_root = info->extent_root;
2924        struct extent_buffer *leaf;
2925        struct btrfs_extent_item *ei;
2926        struct btrfs_extent_inline_ref *iref;
2927        int ret;
2928        int is_data;
2929        int extent_slot = 0;
2930        int found_extent = 0;
2931        int num_to_del = 1;
2932        u32 item_size;
2933        u64 refs;
2934        u64 bytenr = node->bytenr;
2935        u64 num_bytes = node->num_bytes;
2936        int last_ref = 0;
2937        bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2938
2939        path = btrfs_alloc_path();
2940        if (!path)
2941                return -ENOMEM;
2942
2943        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2944
2945        if (!is_data && refs_to_drop != 1) {
2946                btrfs_crit(info,
2947"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2948                           node->bytenr, refs_to_drop);
2949                ret = -EINVAL;
2950                btrfs_abort_transaction(trans, ret);
2951                goto out;
2952        }
2953
2954        if (is_data)
2955                skinny_metadata = false;
2956
2957        ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2958                                    parent, root_objectid, owner_objectid,
2959                                    owner_offset);
2960        if (ret == 0) {
2961                /*
2962                 * Either the inline backref or the SHARED_DATA_REF/
2963                 * SHARED_BLOCK_REF is found
2964                 *
2965                 * Here is a quick path to locate EXTENT/METADATA_ITEM.
2966                 * It's possible the EXTENT/METADATA_ITEM is near current slot.
2967                 */
2968                extent_slot = path->slots[0];
2969                while (extent_slot >= 0) {
2970                        btrfs_item_key_to_cpu(path->nodes[0], &key,
2971                                              extent_slot);
2972                        if (key.objectid != bytenr)
2973                                break;
2974                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2975                            key.offset == num_bytes) {
2976                                found_extent = 1;
2977                                break;
2978                        }
2979                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
2980                            key.offset == owner_objectid) {
2981                                found_extent = 1;
2982                                break;
2983                        }
2984
2985                        /* Quick path didn't find the EXTEMT/METADATA_ITEM */
2986                        if (path->slots[0] - extent_slot > 5)
2987                                break;
2988                        extent_slot--;
2989                }
2990
2991                if (!found_extent) {
2992                        if (iref) {
2993                                btrfs_crit(info,
2994"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
2995                                btrfs_abort_transaction(trans, -EUCLEAN);
2996                                goto err_dump;
2997                        }
2998                        /* Must be SHARED_* item, remove the backref first */
2999                        ret = remove_extent_backref(trans, path, NULL,
3000                                                    refs_to_drop,
3001                                                    is_data, &last_ref);
3002                        if (ret) {
3003                                btrfs_abort_transaction(trans, ret);
3004                                goto out;
3005                        }
3006                        btrfs_release_path(path);
3007
3008                        /* Slow path to locate EXTENT/METADATA_ITEM */
3009                        key.objectid = bytenr;
3010                        key.type = BTRFS_EXTENT_ITEM_KEY;
3011                        key.offset = num_bytes;
3012
3013                        if (!is_data && skinny_metadata) {
3014                                key.type = BTRFS_METADATA_ITEM_KEY;
3015                                key.offset = owner_objectid;
3016                        }
3017
3018                        ret = btrfs_search_slot(trans, extent_root,
3019                                                &key, path, -1, 1);
3020                        if (ret > 0 && skinny_metadata && path->slots[0]) {
3021                                /*
3022                                 * Couldn't find our skinny metadata item,
3023                                 * see if we have ye olde extent item.
3024                                 */
3025                                path->slots[0]--;
3026                                btrfs_item_key_to_cpu(path->nodes[0], &key,
3027                                                      path->slots[0]);
3028                                if (key.objectid == bytenr &&
3029                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
3030                                    key.offset == num_bytes)
3031                                        ret = 0;
3032                        }
3033
3034                        if (ret > 0 && skinny_metadata) {
3035                                skinny_metadata = false;
3036                                key.objectid = bytenr;
3037                                key.type = BTRFS_EXTENT_ITEM_KEY;
3038                                key.offset = num_bytes;
3039                                btrfs_release_path(path);
3040                                ret = btrfs_search_slot(trans, extent_root,
3041                                                        &key, path, -1, 1);
3042                        }
3043
3044                        if (ret) {
3045                                btrfs_err(info,
3046                                          "umm, got %d back from search, was looking for %llu",
3047                                          ret, bytenr);
3048                                if (ret > 0)
3049                                        btrfs_print_leaf(path->nodes[0]);
3050                        }
3051                        if (ret < 0) {
3052                                btrfs_abort_transaction(trans, ret);
3053                                goto out;
3054                        }
3055                        extent_slot = path->slots[0];
3056                }
3057        } else if (WARN_ON(ret == -ENOENT)) {
3058                btrfs_print_leaf(path->nodes[0]);
3059                btrfs_err(info,
3060                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3061                        bytenr, parent, root_objectid, owner_objectid,
3062                        owner_offset);
3063                btrfs_abort_transaction(trans, ret);
3064                goto out;
3065        } else {
3066                btrfs_abort_transaction(trans, ret);
3067                goto out;
3068        }
3069
3070        leaf = path->nodes[0];
3071        item_size = btrfs_item_size_nr(leaf, extent_slot);
3072        if (unlikely(item_size < sizeof(*ei))) {
3073                ret = -EINVAL;
3074                btrfs_print_v0_err(info);
3075                btrfs_abort_transaction(trans, ret);
3076                goto out;
3077        }
3078        ei = btrfs_item_ptr(leaf, extent_slot,
3079                            struct btrfs_extent_item);
3080        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3081            key.type == BTRFS_EXTENT_ITEM_KEY) {
3082                struct btrfs_tree_block_info *bi;
3083                if (item_size < sizeof(*ei) + sizeof(*bi)) {
3084                        btrfs_crit(info,
3085"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3086                                   key.objectid, key.type, key.offset,
3087                                   owner_objectid, item_size,
3088                                   sizeof(*ei) + sizeof(*bi));
3089                        btrfs_abort_transaction(trans, -EUCLEAN);
3090                        goto err_dump;
3091                }
3092                bi = (struct btrfs_tree_block_info *)(ei + 1);
3093                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3094        }
3095
3096        refs = btrfs_extent_refs(leaf, ei);
3097        if (refs < refs_to_drop) {
3098                btrfs_crit(info,
3099                "trying to drop %d refs but we only have %llu for bytenr %llu",
3100                          refs_to_drop, refs, bytenr);
3101                btrfs_abort_transaction(trans, -EUCLEAN);
3102                goto err_dump;
3103        }
3104        refs -= refs_to_drop;
3105
3106        if (refs > 0) {
3107                if (extent_op)
3108                        __run_delayed_extent_op(extent_op, leaf, ei);
3109                /*
3110                 * In the case of inline back ref, reference count will
3111                 * be updated by remove_extent_backref
3112                 */
3113                if (iref) {
3114                        if (!found_extent) {
3115                                btrfs_crit(info,
3116"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3117                                btrfs_abort_transaction(trans, -EUCLEAN);
3118                                goto err_dump;
3119                        }
3120                } else {
3121                        btrfs_set_extent_refs(leaf, ei, refs);
3122                        btrfs_mark_buffer_dirty(leaf);
3123                }
3124                if (found_extent) {
3125                        ret = remove_extent_backref(trans, path, iref,
3126                                                    refs_to_drop, is_data,
3127                                                    &last_ref);
3128                        if (ret) {
3129                                btrfs_abort_transaction(trans, ret);
3130                                goto out;
3131                        }
3132                }
3133        } else {
3134                /* In this branch refs == 1 */
3135                if (found_extent) {
3136                        if (is_data && refs_to_drop !=
3137                            extent_data_ref_count(path, iref)) {
3138                                btrfs_crit(info,
3139                "invalid refs_to_drop, current refs %u refs_to_drop %u",
3140                                           extent_data_ref_count(path, iref),
3141                                           refs_to_drop);
3142                                btrfs_abort_transaction(trans, -EUCLEAN);
3143                                goto err_dump;
3144                        }
3145                        if (iref) {
3146                                if (path->slots[0] != extent_slot) {
3147                                        btrfs_crit(info,
3148"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3149                                                   key.objectid, key.type,
3150                                                   key.offset);
3151                                        btrfs_abort_transaction(trans, -EUCLEAN);
3152                                        goto err_dump;
3153                                }
3154                        } else {
3155                                /*
3156                                 * No inline ref, we must be at SHARED_* item,
3157                                 * And it's single ref, it must be:
3158                                 * |    extent_slot       ||extent_slot + 1|
3159                                 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3160                                 */
3161                                if (path->slots[0] != extent_slot + 1) {
3162                                        btrfs_crit(info,
3163        "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3164                                        btrfs_abort_transaction(trans, -EUCLEAN);
3165                                        goto err_dump;
3166                                }
3167                                path->slots[0] = extent_slot;
3168                                num_to_del = 2;
3169                        }
3170                }
3171
3172                last_ref = 1;
3173                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3174                                      num_to_del);
3175                if (ret) {
3176                        btrfs_abort_transaction(trans, ret);
3177                        goto out;
3178                }
3179                btrfs_release_path(path);
3180
3181                if (is_data) {
3182                        ret = btrfs_del_csums(trans, info->csum_root, bytenr,
3183                                              num_bytes);
3184                        if (ret) {
3185                                btrfs_abort_transaction(trans, ret);
3186                                goto out;
3187                        }
3188                }
3189
3190                ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3191                if (ret) {
3192                        btrfs_abort_transaction(trans, ret);
3193                        goto out;
3194                }
3195
3196                ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
3197                if (ret) {
3198                        btrfs_abort_transaction(trans, ret);
3199                        goto out;
3200                }
3201        }
3202        btrfs_release_path(path);
3203
3204out:
3205        btrfs_free_path(path);
3206        return ret;
3207err_dump:
3208        /*
3209         * Leaf dump can take up a lot of log buffer, so we only do full leaf
3210         * dump for debug build.
3211         */
3212        if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3213                btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3214                           path->slots[0], extent_slot);
3215                btrfs_print_leaf(path->nodes[0]);
3216        }
3217
3218        btrfs_free_path(path);
3219        return -EUCLEAN;
3220}
3221
3222/*
3223 * when we free an block, it is possible (and likely) that we free the last
3224 * delayed ref for that extent as well.  This searches the delayed ref tree for
3225 * a given extent, and if there are no other delayed refs to be processed, it
3226 * removes it from the tree.
3227 */
3228static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3229                                      u64 bytenr)
3230{
3231        struct btrfs_delayed_ref_head *head;
3232        struct btrfs_delayed_ref_root *delayed_refs;
3233        int ret = 0;
3234
3235        delayed_refs = &trans->transaction->delayed_refs;
3236        spin_lock(&delayed_refs->lock);
3237        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3238        if (!head)
3239                goto out_delayed_unlock;
3240
3241        spin_lock(&head->lock);
3242        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3243                goto out;
3244
3245        if (cleanup_extent_op(head) != NULL)
3246                goto out;
3247
3248        /*
3249         * waiting for the lock here would deadlock.  If someone else has it
3250         * locked they are already in the process of dropping it anyway
3251         */
3252        if (!mutex_trylock(&head->mutex))
3253                goto out;
3254
3255        btrfs_delete_ref_head(delayed_refs, head);
3256        head->processing = 0;
3257
3258        spin_unlock(&head->lock);
3259        spin_unlock(&delayed_refs->lock);
3260
3261        BUG_ON(head->extent_op);
3262        if (head->must_insert_reserved)
3263                ret = 1;
3264
3265        btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3266        mutex_unlock(&head->mutex);
3267        btrfs_put_delayed_ref_head(head);
3268        return ret;
3269out:
3270        spin_unlock(&head->lock);
3271
3272out_delayed_unlock:
3273        spin_unlock(&delayed_refs->lock);
3274        return 0;
3275}
3276
3277void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3278                           u64 root_id,
3279                           struct extent_buffer *buf,
3280                           u64 parent, int last_ref)
3281{
3282        struct btrfs_fs_info *fs_info = trans->fs_info;
3283        struct btrfs_ref generic_ref = { 0 };
3284        int ret;
3285
3286        btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3287                               buf->start, buf->len, parent);
3288        btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3289                            root_id, 0, false);
3290
3291        if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3292                btrfs_ref_tree_mod(fs_info, &generic_ref);
3293                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3294                BUG_ON(ret); /* -ENOMEM */
3295        }
3296
3297        if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3298                struct btrfs_block_group *cache;
3299                bool must_pin = false;
3300
3301                if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3302                        ret = check_ref_cleanup(trans, buf->start);
3303                        if (!ret) {
3304                                btrfs_redirty_list_add(trans->transaction, buf);
3305                                goto out;
3306                        }
3307                }
3308
3309                cache = btrfs_lookup_block_group(fs_info, buf->start);
3310
3311                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3312                        pin_down_extent(trans, cache, buf->start, buf->len, 1);
3313                        btrfs_put_block_group(cache);
3314                        goto out;
3315                }
3316
3317                /*
3318                 * If this is a leaf and there are tree mod log users, we may
3319                 * have recorded mod log operations that point to this leaf.
3320                 * So we must make sure no one reuses this leaf's extent before
3321                 * mod log operations are applied to a node, otherwise after
3322                 * rewinding a node using the mod log operations we get an
3323                 * inconsistent btree, as the leaf's extent may now be used as
3324                 * a node or leaf for another different btree.
3325                 * We are safe from races here because at this point no other
3326                 * node or root points to this extent buffer, so if after this
3327                 * check a new tree mod log user joins, it will not be able to
3328                 * find a node pointing to this leaf and record operations that
3329                 * point to this leaf.
3330                 */
3331                if (btrfs_header_level(buf) == 0 &&
3332                    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3333                        must_pin = true;
3334
3335                if (must_pin || btrfs_is_zoned(fs_info)) {
3336                        btrfs_redirty_list_add(trans->transaction, buf);
3337                        pin_down_extent(trans, cache, buf->start, buf->len, 1);
3338                        btrfs_put_block_group(cache);
3339                        goto out;
3340                }
3341
3342                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3343
3344                btrfs_add_free_space(cache, buf->start, buf->len);
3345                btrfs_free_reserved_bytes(cache, buf->len, 0);
3346                btrfs_put_block_group(cache);
3347                trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3348        }
3349out:
3350        if (last_ref) {
3351                /*
3352                 * Deleting the buffer, clear the corrupt flag since it doesn't
3353                 * matter anymore.
3354                 */
3355                clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3356        }
3357}
3358
3359/* Can return -ENOMEM */
3360int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3361{
3362        struct btrfs_fs_info *fs_info = trans->fs_info;
3363        int ret;
3364
3365        if (btrfs_is_testing(fs_info))
3366                return 0;
3367
3368        /*
3369         * tree log blocks never actually go into the extent allocation
3370         * tree, just update pinning info and exit early.
3371         */
3372        if ((ref->type == BTRFS_REF_METADATA &&
3373             ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3374            (ref->type == BTRFS_REF_DATA &&
3375             ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3376                /* unlocks the pinned mutex */
3377                btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3378                ret = 0;
3379        } else if (ref->type == BTRFS_REF_METADATA) {
3380                ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3381        } else {
3382                ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3383        }
3384
3385        if (!((ref->type == BTRFS_REF_METADATA &&
3386               ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3387              (ref->type == BTRFS_REF_DATA &&
3388               ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3389                btrfs_ref_tree_mod(fs_info, ref);
3390
3391        return ret;
3392}
3393
3394enum btrfs_loop_type {
3395        LOOP_CACHING_NOWAIT,
3396        LOOP_CACHING_WAIT,
3397        LOOP_ALLOC_CHUNK,
3398        LOOP_NO_EMPTY_SIZE,
3399};
3400
3401static inline void
3402btrfs_lock_block_group(struct btrfs_block_group *cache,
3403                       int delalloc)
3404{
3405        if (delalloc)
3406                down_read(&cache->data_rwsem);
3407}
3408
3409static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3410                       int delalloc)
3411{
3412        btrfs_get_block_group(cache);
3413        if (delalloc)
3414                down_read(&cache->data_rwsem);
3415}
3416
3417static struct btrfs_block_group *btrfs_lock_cluster(
3418                   struct btrfs_block_group *block_group,
3419                   struct btrfs_free_cluster *cluster,
3420                   int delalloc)
3421        __acquires(&cluster->refill_lock)
3422{
3423        struct btrfs_block_group *used_bg = NULL;
3424
3425        spin_lock(&cluster->refill_lock);
3426        while (1) {
3427                used_bg = cluster->block_group;
3428                if (!used_bg)
3429                        return NULL;
3430
3431                if (used_bg == block_group)
3432                        return used_bg;
3433
3434                btrfs_get_block_group(used_bg);
3435
3436                if (!delalloc)
3437                        return used_bg;
3438
3439                if (down_read_trylock(&used_bg->data_rwsem))
3440                        return used_bg;
3441
3442                spin_unlock(&cluster->refill_lock);
3443
3444                /* We should only have one-level nested. */
3445                down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3446
3447                spin_lock(&cluster->refill_lock);
3448                if (used_bg == cluster->block_group)
3449                        return used_bg;
3450
3451                up_read(&used_bg->data_rwsem);
3452                btrfs_put_block_group(used_bg);
3453        }
3454}
3455
3456static inline void
3457btrfs_release_block_group(struct btrfs_block_group *cache,
3458                         int delalloc)
3459{
3460        if (delalloc)
3461                up_read(&cache->data_rwsem);
3462        btrfs_put_block_group(cache);
3463}
3464
3465enum btrfs_extent_allocation_policy {
3466        BTRFS_EXTENT_ALLOC_CLUSTERED,
3467        BTRFS_EXTENT_ALLOC_ZONED,
3468};
3469
3470/*
3471 * Structure used internally for find_free_extent() function.  Wraps needed
3472 * parameters.
3473 */
3474struct find_free_extent_ctl {
3475        /* Basic allocation info */
3476        u64 ram_bytes;
3477        u64 num_bytes;
3478        u64 min_alloc_size;
3479        u64 empty_size;
3480        u64 flags;
3481        int delalloc;
3482
3483        /* Where to start the search inside the bg */
3484        u64 search_start;
3485
3486        /* For clustered allocation */
3487        u64 empty_cluster;
3488        struct btrfs_free_cluster *last_ptr;
3489        bool use_cluster;
3490
3491        bool have_caching_bg;
3492        bool orig_have_caching_bg;
3493
3494        /* Allocation is called for tree-log */
3495        bool for_treelog;
3496
3497        /* Allocation is called for data relocation */
3498        bool for_data_reloc;
3499
3500        /* RAID index, converted from flags */
3501        int index;
3502
3503        /*
3504         * Current loop number, check find_free_extent_update_loop() for details
3505         */
3506        int loop;
3507
3508        /*
3509         * Whether we're refilling a cluster, if true we need to re-search
3510         * current block group but don't try to refill the cluster again.
3511         */
3512        bool retry_clustered;
3513
3514        /*
3515         * Whether we're updating free space cache, if true we need to re-search
3516         * current block group but don't try updating free space cache again.
3517         */
3518        bool retry_unclustered;
3519
3520        /* If current block group is cached */
3521        int cached;
3522
3523        /* Max contiguous hole found */
3524        u64 max_extent_size;
3525
3526        /* Total free space from free space cache, not always contiguous */
3527        u64 total_free_space;
3528
3529        /* Found result */
3530        u64 found_offset;
3531
3532        /* Hint where to start looking for an empty space */
3533        u64 hint_byte;
3534
3535        /* Allocation policy */
3536        enum btrfs_extent_allocation_policy policy;
3537};
3538
3539
3540/*
3541 * Helper function for find_free_extent().
3542 *
3543 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3544 * Return -EAGAIN to inform caller that we need to re-search this block group
3545 * Return >0 to inform caller that we find nothing
3546 * Return 0 means we have found a location and set ffe_ctl->found_offset.
3547 */
3548static int find_free_extent_clustered(struct btrfs_block_group *bg,
3549                                      struct find_free_extent_ctl *ffe_ctl,
3550                                      struct btrfs_block_group **cluster_bg_ret)
3551{
3552        struct btrfs_block_group *cluster_bg;
3553        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3554        u64 aligned_cluster;
3555        u64 offset;
3556        int ret;
3557
3558        cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3559        if (!cluster_bg)
3560                goto refill_cluster;
3561        if (cluster_bg != bg && (cluster_bg->ro ||
3562            !block_group_bits(cluster_bg, ffe_ctl->flags)))
3563                goto release_cluster;
3564
3565        offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3566                        ffe_ctl->num_bytes, cluster_bg->start,
3567                        &ffe_ctl->max_extent_size);
3568        if (offset) {
3569                /* We have a block, we're done */
3570                spin_unlock(&last_ptr->refill_lock);
3571                trace_btrfs_reserve_extent_cluster(cluster_bg,
3572                                ffe_ctl->search_start, ffe_ctl->num_bytes);
3573                *cluster_bg_ret = cluster_bg;
3574                ffe_ctl->found_offset = offset;
3575                return 0;
3576        }
3577        WARN_ON(last_ptr->block_group != cluster_bg);
3578
3579release_cluster:
3580        /*
3581         * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3582         * lets just skip it and let the allocator find whatever block it can
3583         * find. If we reach this point, we will have tried the cluster
3584         * allocator plenty of times and not have found anything, so we are
3585         * likely way too fragmented for the clustering stuff to find anything.
3586         *
3587         * However, if the cluster is taken from the current block group,
3588         * release the cluster first, so that we stand a better chance of
3589         * succeeding in the unclustered allocation.
3590         */
3591        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3592                spin_unlock(&last_ptr->refill_lock);
3593                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3594                return -ENOENT;
3595        }
3596
3597        /* This cluster didn't work out, free it and start over */
3598        btrfs_return_cluster_to_free_space(NULL, last_ptr);
3599
3600        if (cluster_bg != bg)
3601                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3602
3603refill_cluster:
3604        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3605                spin_unlock(&last_ptr->refill_lock);
3606                return -ENOENT;
3607        }
3608
3609        aligned_cluster = max_t(u64,
3610                        ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3611                        bg->full_stripe_len);
3612        ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3613                        ffe_ctl->num_bytes, aligned_cluster);
3614        if (ret == 0) {
3615                /* Now pull our allocation out of this cluster */
3616                offset = btrfs_alloc_from_cluster(bg, last_ptr,
3617                                ffe_ctl->num_bytes, ffe_ctl->search_start,
3618                                &ffe_ctl->max_extent_size);
3619                if (offset) {
3620                        /* We found one, proceed */
3621                        spin_unlock(&last_ptr->refill_lock);
3622                        trace_btrfs_reserve_extent_cluster(bg,
3623                                        ffe_ctl->search_start,
3624                                        ffe_ctl->num_bytes);
3625                        ffe_ctl->found_offset = offset;
3626                        return 0;
3627                }
3628        } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3629                   !ffe_ctl->retry_clustered) {
3630                spin_unlock(&last_ptr->refill_lock);
3631
3632                ffe_ctl->retry_clustered = true;
3633                btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3634                                ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3635                return -EAGAIN;
3636        }
3637        /*
3638         * At this point we either didn't find a cluster or we weren't able to
3639         * allocate a block from our cluster.  Free the cluster we've been
3640         * trying to use, and go to the next block group.
3641         */
3642        btrfs_return_cluster_to_free_space(NULL, last_ptr);
3643        spin_unlock(&last_ptr->refill_lock);
3644        return 1;
3645}
3646
3647/*
3648 * Return >0 to inform caller that we find nothing
3649 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3650 * Return -EAGAIN to inform caller that we need to re-search this block group
3651 */
3652static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3653                                        struct find_free_extent_ctl *ffe_ctl)
3654{
3655        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3656        u64 offset;
3657
3658        /*
3659         * We are doing an unclustered allocation, set the fragmented flag so
3660         * we don't bother trying to setup a cluster again until we get more
3661         * space.
3662         */
3663        if (unlikely(last_ptr)) {
3664                spin_lock(&last_ptr->lock);
3665                last_ptr->fragmented = 1;
3666                spin_unlock(&last_ptr->lock);
3667        }
3668        if (ffe_ctl->cached) {
3669                struct btrfs_free_space_ctl *free_space_ctl;
3670
3671                free_space_ctl = bg->free_space_ctl;
3672                spin_lock(&free_space_ctl->tree_lock);
3673                if (free_space_ctl->free_space <
3674                    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3675                    ffe_ctl->empty_size) {
3676                        ffe_ctl->total_free_space = max_t(u64,
3677                                        ffe_ctl->total_free_space,
3678                                        free_space_ctl->free_space);
3679                        spin_unlock(&free_space_ctl->tree_lock);
3680                        return 1;
3681                }
3682                spin_unlock(&free_space_ctl->tree_lock);
3683        }
3684
3685        offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3686                        ffe_ctl->num_bytes, ffe_ctl->empty_size,
3687                        &ffe_ctl->max_extent_size);
3688
3689        /*
3690         * If we didn't find a chunk, and we haven't failed on this block group
3691         * before, and this block group is in the middle of caching and we are
3692         * ok with waiting, then go ahead and wait for progress to be made, and
3693         * set @retry_unclustered to true.
3694         *
3695         * If @retry_unclustered is true then we've already waited on this
3696         * block group once and should move on to the next block group.
3697         */
3698        if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3699            ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3700                btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3701                                                      ffe_ctl->empty_size);
3702                ffe_ctl->retry_unclustered = true;
3703                return -EAGAIN;
3704        } else if (!offset) {
3705                return 1;
3706        }
3707        ffe_ctl->found_offset = offset;
3708        return 0;
3709}
3710
3711static int do_allocation_clustered(struct btrfs_block_group *block_group,
3712                                   struct find_free_extent_ctl *ffe_ctl,
3713                                   struct btrfs_block_group **bg_ret)
3714{
3715        int ret;
3716
3717        /* We want to try and use the cluster allocator, so lets look there */
3718        if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3719                ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3720                if (ret >= 0 || ret == -EAGAIN)
3721                        return ret;
3722                /* ret == -ENOENT case falls through */
3723        }
3724
3725        return find_free_extent_unclustered(block_group, ffe_ctl);
3726}
3727
3728/*
3729 * Tree-log block group locking
3730 * ============================
3731 *
3732 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3733 * indicates the starting address of a block group, which is reserved only
3734 * for tree-log metadata.
3735 *
3736 * Lock nesting
3737 * ============
3738 *
3739 * space_info::lock
3740 *   block_group::lock
3741 *     fs_info::treelog_bg_lock
3742 */
3743
3744/*
3745 * Simple allocator for sequential-only block group. It only allows sequential
3746 * allocation. No need to play with trees. This function also reserves the
3747 * bytes as in btrfs_add_reserved_bytes.
3748 */
3749static int do_allocation_zoned(struct btrfs_block_group *block_group,
3750                               struct find_free_extent_ctl *ffe_ctl,
3751                               struct btrfs_block_group **bg_ret)
3752{
3753        struct btrfs_fs_info *fs_info = block_group->fs_info;
3754        struct btrfs_space_info *space_info = block_group->space_info;
3755        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3756        u64 start = block_group->start;
3757        u64 num_bytes = ffe_ctl->num_bytes;
3758        u64 avail;
3759        u64 bytenr = block_group->start;
3760        u64 log_bytenr;
3761        u64 data_reloc_bytenr;
3762        int ret = 0;
3763        bool skip = false;
3764
3765        ASSERT(btrfs_is_zoned(block_group->fs_info));
3766
3767        /*
3768         * Do not allow non-tree-log blocks in the dedicated tree-log block
3769         * group, and vice versa.
3770         */
3771        spin_lock(&fs_info->treelog_bg_lock);
3772        log_bytenr = fs_info->treelog_bg;
3773        if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3774                           (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3775                skip = true;
3776        spin_unlock(&fs_info->treelog_bg_lock);
3777        if (skip)
3778                return 1;
3779
3780        /*
3781         * Do not allow non-relocation blocks in the dedicated relocation block
3782         * group, and vice versa.
3783         */
3784        spin_lock(&fs_info->relocation_bg_lock);
3785        data_reloc_bytenr = fs_info->data_reloc_bg;
3786        if (data_reloc_bytenr &&
3787            ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3788             (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3789                skip = true;
3790        spin_unlock(&fs_info->relocation_bg_lock);
3791        if (skip)
3792                return 1;
3793        /* Check RO and no space case before trying to activate it */
3794        spin_lock(&block_group->lock);
3795        if (block_group->ro ||
3796            block_group->alloc_offset == block_group->zone_capacity) {
3797                spin_unlock(&block_group->lock);
3798                return 1;
3799        }
3800        spin_unlock(&block_group->lock);
3801
3802        if (!btrfs_zone_activate(block_group))
3803                return 1;
3804
3805        spin_lock(&space_info->lock);
3806        spin_lock(&block_group->lock);
3807        spin_lock(&fs_info->treelog_bg_lock);
3808        spin_lock(&fs_info->relocation_bg_lock);
3809
3810        ASSERT(!ffe_ctl->for_treelog ||
3811               block_group->start == fs_info->treelog_bg ||
3812               fs_info->treelog_bg == 0);
3813        ASSERT(!ffe_ctl->for_data_reloc ||
3814               block_group->start == fs_info->data_reloc_bg ||
3815               fs_info->data_reloc_bg == 0);
3816
3817        if (block_group->ro) {
3818                ret = 1;
3819                goto out;
3820        }
3821
3822        /*
3823         * Do not allow currently using block group to be tree-log dedicated
3824         * block group.
3825         */
3826        if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3827            (block_group->used || block_group->reserved)) {
3828                ret = 1;
3829                goto out;
3830        }
3831
3832        /*
3833         * Do not allow currently used block group to be the data relocation
3834         * dedicated block group.
3835         */
3836        if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3837            (block_group->used || block_group->reserved)) {
3838                ret = 1;
3839                goto out;
3840        }
3841
3842        WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3843        avail = block_group->zone_capacity - block_group->alloc_offset;
3844        if (avail < num_bytes) {
3845                if (ffe_ctl->max_extent_size < avail) {
3846                        /*
3847                         * With sequential allocator, free space is always
3848                         * contiguous
3849                         */
3850                        ffe_ctl->max_extent_size = avail;
3851                        ffe_ctl->total_free_space = avail;
3852                }
3853                ret = 1;
3854                goto out;
3855        }
3856
3857        if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3858                fs_info->treelog_bg = block_group->start;
3859
3860        if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
3861                fs_info->data_reloc_bg = block_group->start;
3862
3863        ffe_ctl->found_offset = start + block_group->alloc_offset;
3864        block_group->alloc_offset += num_bytes;
3865        spin_lock(&ctl->tree_lock);
3866        ctl->free_space -= num_bytes;
3867        spin_unlock(&ctl->tree_lock);
3868
3869        /*
3870         * We do not check if found_offset is aligned to stripesize. The
3871         * address is anyway rewritten when using zone append writing.
3872         */
3873
3874        ffe_ctl->search_start = ffe_ctl->found_offset;
3875
3876out:
3877        if (ret && ffe_ctl->for_treelog)
3878                fs_info->treelog_bg = 0;
3879        if (ret && ffe_ctl->for_data_reloc)
3880                fs_info->data_reloc_bg = 0;
3881        spin_unlock(&fs_info->relocation_bg_lock);
3882        spin_unlock(&fs_info->treelog_bg_lock);
3883        spin_unlock(&block_group->lock);
3884        spin_unlock(&space_info->lock);
3885        return ret;
3886}
3887
3888static int do_allocation(struct btrfs_block_group *block_group,
3889                         struct find_free_extent_ctl *ffe_ctl,
3890                         struct btrfs_block_group **bg_ret)
3891{
3892        switch (ffe_ctl->policy) {
3893        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3894                return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3895        case BTRFS_EXTENT_ALLOC_ZONED:
3896                return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3897        default:
3898                BUG();
3899        }
3900}
3901
3902static void release_block_group(struct btrfs_block_group *block_group,
3903                                struct find_free_extent_ctl *ffe_ctl,
3904                                int delalloc)
3905{
3906        switch (ffe_ctl->policy) {
3907        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3908                ffe_ctl->retry_clustered = false;
3909                ffe_ctl->retry_unclustered = false;
3910                break;
3911        case BTRFS_EXTENT_ALLOC_ZONED:
3912                /* Nothing to do */
3913                break;
3914        default:
3915                BUG();
3916        }
3917
3918        BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3919               ffe_ctl->index);
3920        btrfs_release_block_group(block_group, delalloc);
3921}
3922
3923static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3924                                   struct btrfs_key *ins)
3925{
3926        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3927
3928        if (!ffe_ctl->use_cluster && last_ptr) {
3929                spin_lock(&last_ptr->lock);
3930                last_ptr->window_start = ins->objectid;
3931                spin_unlock(&last_ptr->lock);
3932        }
3933}
3934
3935static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3936                         struct btrfs_key *ins)
3937{
3938        switch (ffe_ctl->policy) {
3939        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3940                found_extent_clustered(ffe_ctl, ins);
3941                break;
3942        case BTRFS_EXTENT_ALLOC_ZONED:
3943                /* Nothing to do */
3944                break;
3945        default:
3946                BUG();
3947        }
3948}
3949
3950static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3951{
3952        switch (ffe_ctl->policy) {
3953        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3954                /*
3955                 * If we can't allocate a new chunk we've already looped through
3956                 * at least once, move on to the NO_EMPTY_SIZE case.
3957                 */
3958                ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3959                return 0;
3960        case BTRFS_EXTENT_ALLOC_ZONED:
3961                /* Give up here */
3962                return -ENOSPC;
3963        default:
3964                BUG();
3965        }
3966}
3967
3968/*
3969 * Return >0 means caller needs to re-search for free extent
3970 * Return 0 means we have the needed free extent.
3971 * Return <0 means we failed to locate any free extent.
3972 */
3973static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3974                                        struct btrfs_key *ins,
3975                                        struct find_free_extent_ctl *ffe_ctl,
3976                                        bool full_search)
3977{
3978        struct btrfs_root *root = fs_info->extent_root;
3979        int ret;
3980
3981        if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3982            ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3983                ffe_ctl->orig_have_caching_bg = true;
3984
3985        if (ins->objectid) {
3986                found_extent(ffe_ctl, ins);
3987                return 0;
3988        }
3989
3990        if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
3991            !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
3992                /*
3993                 * If we have enough free space left in an already active block
3994                 * group and we can't activate any other zone now, retry the
3995                 * active ones with a smaller allocation size.  Returning early
3996                 * from here will tell btrfs_reserve_extent() to haven the
3997                 * size.
3998                 */
3999                return -ENOSPC;
4000        }
4001
4002        if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4003                return 1;
4004
4005        ffe_ctl->index++;
4006        if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4007                return 1;
4008
4009        /*
4010         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4011         *                      caching kthreads as we move along
4012         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4013         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4014         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4015         *                     again
4016         */
4017        if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4018                ffe_ctl->index = 0;
4019                if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
4020                        /*
4021                         * We want to skip the LOOP_CACHING_WAIT step if we
4022                         * don't have any uncached bgs and we've already done a
4023                         * full search through.
4024                         */
4025                        if (ffe_ctl->orig_have_caching_bg || !full_search)
4026                                ffe_ctl->loop = LOOP_CACHING_WAIT;
4027                        else
4028                                ffe_ctl->loop = LOOP_ALLOC_CHUNK;
4029                } else {
4030                        ffe_ctl->loop++;
4031                }
4032
4033                if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4034                        struct btrfs_trans_handle *trans;
4035                        int exist = 0;
4036
4037                        trans = current->journal_info;
4038                        if (trans)
4039                                exist = 1;
4040                        else
4041                                trans = btrfs_join_transaction(root);
4042
4043                        if (IS_ERR(trans)) {
4044                                ret = PTR_ERR(trans);
4045                                return ret;
4046                        }
4047
4048                        ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4049                                                CHUNK_ALLOC_FORCE);
4050
4051                        /* Do not bail out on ENOSPC since we can do more. */
4052                        if (ret == -ENOSPC)
4053                                ret = chunk_allocation_failed(ffe_ctl);
4054                        else if (ret < 0)
4055                                btrfs_abort_transaction(trans, ret);
4056                        else
4057                                ret = 0;
4058                        if (!exist)
4059                                btrfs_end_transaction(trans);
4060                        if (ret)
4061                                return ret;
4062                }
4063
4064                if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4065                        if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4066                                return -ENOSPC;
4067
4068                        /*
4069                         * Don't loop again if we already have no empty_size and
4070                         * no empty_cluster.
4071                         */
4072                        if (ffe_ctl->empty_size == 0 &&
4073                            ffe_ctl->empty_cluster == 0)
4074                                return -ENOSPC;
4075                        ffe_ctl->empty_size = 0;
4076                        ffe_ctl->empty_cluster = 0;
4077                }
4078                return 1;
4079        }
4080        return -ENOSPC;
4081}
4082
4083static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4084                                        struct find_free_extent_ctl *ffe_ctl,
4085                                        struct btrfs_space_info *space_info,
4086                                        struct btrfs_key *ins)
4087{
4088        /*
4089         * If our free space is heavily fragmented we may not be able to make
4090         * big contiguous allocations, so instead of doing the expensive search
4091         * for free space, simply return ENOSPC with our max_extent_size so we
4092         * can go ahead and search for a more manageable chunk.
4093         *
4094         * If our max_extent_size is large enough for our allocation simply
4095         * disable clustering since we will likely not be able to find enough
4096         * space to create a cluster and induce latency trying.
4097         */
4098        if (space_info->max_extent_size) {
4099                spin_lock(&space_info->lock);
4100                if (space_info->max_extent_size &&
4101                    ffe_ctl->num_bytes > space_info->max_extent_size) {
4102                        ins->offset = space_info->max_extent_size;
4103                        spin_unlock(&space_info->lock);
4104                        return -ENOSPC;
4105                } else if (space_info->max_extent_size) {
4106                        ffe_ctl->use_cluster = false;
4107                }
4108                spin_unlock(&space_info->lock);
4109        }
4110
4111        ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4112                                               &ffe_ctl->empty_cluster);
4113        if (ffe_ctl->last_ptr) {
4114                struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4115
4116                spin_lock(&last_ptr->lock);
4117                if (last_ptr->block_group)
4118                        ffe_ctl->hint_byte = last_ptr->window_start;
4119                if (last_ptr->fragmented) {
4120                        /*
4121                         * We still set window_start so we can keep track of the
4122                         * last place we found an allocation to try and save
4123                         * some time.
4124                         */
4125                        ffe_ctl->hint_byte = last_ptr->window_start;
4126                        ffe_ctl->use_cluster = false;
4127                }
4128                spin_unlock(&last_ptr->lock);
4129        }
4130
4131        return 0;
4132}
4133
4134static int prepare_allocation(struct btrfs_fs_info *fs_info,
4135                              struct find_free_extent_ctl *ffe_ctl,
4136                              struct btrfs_space_info *space_info,
4137                              struct btrfs_key *ins)
4138{
4139        switch (ffe_ctl->policy) {
4140        case BTRFS_EXTENT_ALLOC_CLUSTERED:
4141                return prepare_allocation_clustered(fs_info, ffe_ctl,
4142                                                    space_info, ins);
4143        case BTRFS_EXTENT_ALLOC_ZONED:
4144                if (ffe_ctl->for_treelog) {
4145                        spin_lock(&fs_info->treelog_bg_lock);
4146                        if (fs_info->treelog_bg)
4147                                ffe_ctl->hint_byte = fs_info->treelog_bg;
4148                        spin_unlock(&fs_info->treelog_bg_lock);
4149                }
4150                if (ffe_ctl->for_data_reloc) {
4151                        spin_lock(&fs_info->relocation_bg_lock);
4152                        if (fs_info->data_reloc_bg)
4153                                ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4154                        spin_unlock(&fs_info->relocation_bg_lock);
4155                }
4156                return 0;
4157        default:
4158                BUG();
4159        }
4160}
4161
4162/*
4163 * walks the btree of allocated extents and find a hole of a given size.
4164 * The key ins is changed to record the hole:
4165 * ins->objectid == start position
4166 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4167 * ins->offset == the size of the hole.
4168 * Any available blocks before search_start are skipped.
4169 *
4170 * If there is no suitable free space, we will record the max size of
4171 * the free space extent currently.
4172 *
4173 * The overall logic and call chain:
4174 *
4175 * find_free_extent()
4176 * |- Iterate through all block groups
4177 * |  |- Get a valid block group
4178 * |  |- Try to do clustered allocation in that block group
4179 * |  |- Try to do unclustered allocation in that block group
4180 * |  |- Check if the result is valid
4181 * |  |  |- If valid, then exit
4182 * |  |- Jump to next block group
4183 * |
4184 * |- Push harder to find free extents
4185 *    |- If not found, re-iterate all block groups
4186 */
4187static noinline int find_free_extent(struct btrfs_root *root,
4188                                     struct btrfs_key *ins,
4189                                     struct find_free_extent_ctl *ffe_ctl)
4190{
4191        struct btrfs_fs_info *fs_info = root->fs_info;
4192        int ret = 0;
4193        int cache_block_group_error = 0;
4194        struct btrfs_block_group *block_group = NULL;
4195        struct btrfs_space_info *space_info;
4196        bool full_search = false;
4197
4198        WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4199
4200        ffe_ctl->search_start = 0;
4201        /* For clustered allocation */
4202        ffe_ctl->empty_cluster = 0;
4203        ffe_ctl->last_ptr = NULL;
4204        ffe_ctl->use_cluster = true;
4205        ffe_ctl->have_caching_bg = false;
4206        ffe_ctl->orig_have_caching_bg = false;
4207        ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4208        ffe_ctl->loop = 0;
4209        /* For clustered allocation */
4210        ffe_ctl->retry_clustered = false;
4211        ffe_ctl->retry_unclustered = false;
4212        ffe_ctl->cached = 0;
4213        ffe_ctl->max_extent_size = 0;
4214        ffe_ctl->total_free_space = 0;
4215        ffe_ctl->found_offset = 0;
4216        ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4217
4218        if (btrfs_is_zoned(fs_info))
4219                ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4220
4221        ins->type = BTRFS_EXTENT_ITEM_KEY;
4222        ins->objectid = 0;
4223        ins->offset = 0;
4224
4225        trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
4226                               ffe_ctl->flags);
4227
4228        space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4229        if (!space_info) {
4230                btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4231                return -ENOSPC;
4232        }
4233
4234        ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4235        if (ret < 0)
4236                return ret;
4237
4238        ffe_ctl->search_start = max(ffe_ctl->search_start,
4239                                    first_logical_byte(fs_info, 0));
4240        ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4241        if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4242                block_group = btrfs_lookup_block_group(fs_info,
4243                                                       ffe_ctl->search_start);
4244                /*
4245                 * we don't want to use the block group if it doesn't match our
4246                 * allocation bits, or if its not cached.
4247                 *
4248                 * However if we are re-searching with an ideal block group
4249                 * picked out then we don't care that the block group is cached.
4250                 */
4251                if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4252                    block_group->cached != BTRFS_CACHE_NO) {
4253                        down_read(&space_info->groups_sem);
4254                        if (list_empty(&block_group->list) ||
4255                            block_group->ro) {
4256                                /*
4257                                 * someone is removing this block group,
4258                                 * we can't jump into the have_block_group
4259                                 * target because our list pointers are not
4260                                 * valid
4261                                 */
4262                                btrfs_put_block_group(block_group);
4263                                up_read(&space_info->groups_sem);
4264                        } else {
4265                                ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4266                                                        block_group->flags);
4267                                btrfs_lock_block_group(block_group,
4268                                                       ffe_ctl->delalloc);
4269                                goto have_block_group;
4270                        }
4271                } else if (block_group) {
4272                        btrfs_put_block_group(block_group);
4273                }
4274        }
4275search:
4276        ffe_ctl->have_caching_bg = false;
4277        if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4278            ffe_ctl->index == 0)
4279                full_search = true;
4280        down_read(&space_info->groups_sem);
4281        list_for_each_entry(block_group,
4282                            &space_info->block_groups[ffe_ctl->index], list) {
4283                struct btrfs_block_group *bg_ret;
4284
4285                /* If the block group is read-only, we can skip it entirely. */
4286                if (unlikely(block_group->ro)) {
4287                        if (ffe_ctl->for_treelog)
4288                                btrfs_clear_treelog_bg(block_group);
4289                        if (ffe_ctl->for_data_reloc)
4290                                btrfs_clear_data_reloc_bg(block_group);
4291                        continue;
4292                }
4293
4294                btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4295                ffe_ctl->search_start = block_group->start;
4296
4297                /*
4298                 * this can happen if we end up cycling through all the
4299                 * raid types, but we want to make sure we only allocate
4300                 * for the proper type.
4301                 */
4302                if (!block_group_bits(block_group, ffe_ctl->flags)) {
4303                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
4304                                BTRFS_BLOCK_GROUP_RAID1_MASK |
4305                                BTRFS_BLOCK_GROUP_RAID56_MASK |
4306                                BTRFS_BLOCK_GROUP_RAID10;
4307
4308                        /*
4309                         * if they asked for extra copies and this block group
4310                         * doesn't provide them, bail.  This does allow us to
4311                         * fill raid0 from raid1.
4312                         */
4313                        if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4314                                goto loop;
4315
4316                        /*
4317                         * This block group has different flags than we want.
4318                         * It's possible that we have MIXED_GROUP flag but no
4319                         * block group is mixed.  Just skip such block group.
4320                         */
4321                        btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4322                        continue;
4323                }
4324
4325have_block_group:
4326                ffe_ctl->cached = btrfs_block_group_done(block_group);
4327                if (unlikely(!ffe_ctl->cached)) {
4328                        ffe_ctl->have_caching_bg = true;
4329                        ret = btrfs_cache_block_group(block_group, 0);
4330
4331                        /*
4332                         * If we get ENOMEM here or something else we want to
4333                         * try other block groups, because it may not be fatal.
4334                         * However if we can't find anything else we need to
4335                         * save our return here so that we return the actual
4336                         * error that caused problems, not ENOSPC.
4337                         */
4338                        if (ret < 0) {
4339                                if (!cache_block_group_error)
4340                                        cache_block_group_error = ret;
4341                                ret = 0;
4342                                goto loop;
4343                        }
4344                        ret = 0;
4345                }
4346
4347                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
4348                        goto loop;
4349
4350                bg_ret = NULL;
4351                ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4352                if (ret == 0) {
4353                        if (bg_ret && bg_ret != block_group) {
4354                                btrfs_release_block_group(block_group,
4355                                                          ffe_ctl->delalloc);
4356                                block_group = bg_ret;
4357                        }
4358                } else if (ret == -EAGAIN) {
4359                        goto have_block_group;
4360                } else if (ret > 0) {
4361                        goto loop;
4362                }
4363
4364                /* Checks */
4365                ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4366                                                 fs_info->stripesize);
4367
4368                /* move on to the next group */
4369                if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4370                    block_group->start + block_group->length) {
4371                        btrfs_add_free_space_unused(block_group,
4372                                            ffe_ctl->found_offset,
4373                                            ffe_ctl->num_bytes);
4374                        goto loop;
4375                }
4376
4377                if (ffe_ctl->found_offset < ffe_ctl->search_start)
4378                        btrfs_add_free_space_unused(block_group,
4379                                        ffe_ctl->found_offset,
4380                                        ffe_ctl->search_start - ffe_ctl->found_offset);
4381
4382                ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4383                                               ffe_ctl->num_bytes,
4384                                               ffe_ctl->delalloc);
4385                if (ret == -EAGAIN) {
4386                        btrfs_add_free_space_unused(block_group,
4387                                        ffe_ctl->found_offset,
4388                                        ffe_ctl->num_bytes);
4389                        goto loop;
4390                }
4391                btrfs_inc_block_group_reservations(block_group);
4392
4393                /* we are all good, lets return */
4394                ins->objectid = ffe_ctl->search_start;
4395                ins->offset = ffe_ctl->num_bytes;
4396
4397                trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
4398                                           ffe_ctl->num_bytes);
4399                btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4400                break;
4401loop:
4402                release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4403                cond_resched();
4404        }
4405        up_read(&space_info->groups_sem);
4406
4407        ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4408        if (ret > 0)
4409                goto search;
4410
4411        if (ret == -ENOSPC && !cache_block_group_error) {
4412                /*
4413                 * Use ffe_ctl->total_free_space as fallback if we can't find
4414                 * any contiguous hole.
4415                 */
4416                if (!ffe_ctl->max_extent_size)
4417                        ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4418                spin_lock(&space_info->lock);
4419                space_info->max_extent_size = ffe_ctl->max_extent_size;
4420                spin_unlock(&space_info->lock);
4421                ins->offset = ffe_ctl->max_extent_size;
4422        } else if (ret == -ENOSPC) {
4423                ret = cache_block_group_error;
4424        }
4425        return ret;
4426}
4427
4428/*
4429 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4430 *                        hole that is at least as big as @num_bytes.
4431 *
4432 * @root           -    The root that will contain this extent
4433 *
4434 * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
4435 *                      is used for accounting purposes. This value differs
4436 *                      from @num_bytes only in the case of compressed extents.
4437 *
4438 * @num_bytes      -    Number of bytes to allocate on-disk.
4439 *
4440 * @min_alloc_size -    Indicates the minimum amount of space that the
4441 *                      allocator should try to satisfy. In some cases
4442 *                      @num_bytes may be larger than what is required and if
4443 *                      the filesystem is fragmented then allocation fails.
4444 *                      However, the presence of @min_alloc_size gives a
4445 *                      chance to try and satisfy the smaller allocation.
4446 *
4447 * @empty_size     -    A hint that you plan on doing more COW. This is the
4448 *                      size in bytes the allocator should try to find free
4449 *                      next to the block it returns.  This is just a hint and
4450 *                      may be ignored by the allocator.
4451 *
4452 * @hint_byte      -    Hint to the allocator to start searching above the byte
4453 *                      address passed. It might be ignored.
4454 *
4455 * @ins            -    This key is modified to record the found hole. It will
4456 *                      have the following values:
4457 *                      ins->objectid == start position
4458 *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
4459 *                      ins->offset == the size of the hole.
4460 *
4461 * @is_data        -    Boolean flag indicating whether an extent is
4462 *                      allocated for data (true) or metadata (false)
4463 *
4464 * @delalloc       -    Boolean flag indicating whether this allocation is for
4465 *                      delalloc or not. If 'true' data_rwsem of block groups
4466 *                      is going to be acquired.
4467 *
4468 *
4469 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4470 * case -ENOSPC is returned then @ins->offset will contain the size of the
4471 * largest available hole the allocator managed to find.
4472 */
4473int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4474                         u64 num_bytes, u64 min_alloc_size,
4475                         u64 empty_size, u64 hint_byte,
4476                         struct btrfs_key *ins, int is_data, int delalloc)
4477{
4478        struct btrfs_fs_info *fs_info = root->fs_info;
4479        struct find_free_extent_ctl ffe_ctl = {};
4480        bool final_tried = num_bytes == min_alloc_size;
4481        u64 flags;
4482        int ret;
4483        bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4484        bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4485
4486        flags = get_alloc_profile_by_root(root, is_data);
4487again:
4488        WARN_ON(num_bytes < fs_info->sectorsize);
4489
4490        ffe_ctl.ram_bytes = ram_bytes;
4491        ffe_ctl.num_bytes = num_bytes;
4492        ffe_ctl.min_alloc_size = min_alloc_size;
4493        ffe_ctl.empty_size = empty_size;
4494        ffe_ctl.flags = flags;
4495        ffe_ctl.delalloc = delalloc;
4496        ffe_ctl.hint_byte = hint_byte;
4497        ffe_ctl.for_treelog = for_treelog;
4498        ffe_ctl.for_data_reloc = for_data_reloc;
4499
4500        ret = find_free_extent(root, ins, &ffe_ctl);
4501        if (!ret && !is_data) {
4502                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4503        } else if (ret == -ENOSPC) {
4504                if (!final_tried && ins->offset) {
4505                        num_bytes = min(num_bytes >> 1, ins->offset);
4506                        num_bytes = round_down(num_bytes,
4507                                               fs_info->sectorsize);
4508                        num_bytes = max(num_bytes, min_alloc_size);
4509                        ram_bytes = num_bytes;
4510                        if (num_bytes == min_alloc_size)
4511                                final_tried = true;
4512                        goto again;
4513                } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4514                        struct btrfs_space_info *sinfo;
4515
4516                        sinfo = btrfs_find_space_info(fs_info, flags);
4517                        btrfs_err(fs_info,
4518        "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4519                                  flags, num_bytes, for_treelog, for_data_reloc);
4520                        if (sinfo)
4521                                btrfs_dump_space_info(fs_info, sinfo,
4522                                                      num_bytes, 1);
4523                }
4524        }
4525
4526        return ret;
4527}
4528
4529int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4530                               u64 start, u64 len, int delalloc)
4531{
4532        struct btrfs_block_group *cache;
4533
4534        cache = btrfs_lookup_block_group(fs_info, start);
4535        if (!cache) {
4536                btrfs_err(fs_info, "Unable to find block group for %llu",
4537                          start);
4538                return -ENOSPC;
4539        }
4540
4541        btrfs_add_free_space(cache, start, len);
4542        btrfs_free_reserved_bytes(cache, len, delalloc);
4543        trace_btrfs_reserved_extent_free(fs_info, start, len);
4544
4545        btrfs_put_block_group(cache);
4546        return 0;
4547}
4548
4549int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4550                              u64 len)
4551{
4552        struct btrfs_block_group *cache;
4553        int ret = 0;
4554
4555        cache = btrfs_lookup_block_group(trans->fs_info, start);
4556        if (!cache) {
4557                btrfs_err(trans->fs_info, "unable to find block group for %llu",
4558                          start);
4559                return -ENOSPC;
4560        }
4561
4562        ret = pin_down_extent(trans, cache, start, len, 1);
4563        btrfs_put_block_group(cache);
4564        return ret;
4565}
4566
4567static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4568                                      u64 parent, u64 root_objectid,
4569                                      u64 flags, u64 owner, u64 offset,
4570                                      struct btrfs_key *ins, int ref_mod)
4571{
4572        struct btrfs_fs_info *fs_info = trans->fs_info;
4573        int ret;
4574        struct btrfs_extent_item *extent_item;
4575        struct btrfs_extent_inline_ref *iref;
4576        struct btrfs_path *path;
4577        struct extent_buffer *leaf;
4578        int type;
4579        u32 size;
4580
4581        if (parent > 0)
4582                type = BTRFS_SHARED_DATA_REF_KEY;
4583        else
4584                type = BTRFS_EXTENT_DATA_REF_KEY;
4585
4586        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4587
4588        path = btrfs_alloc_path();
4589        if (!path)
4590                return -ENOMEM;
4591
4592        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4593                                      ins, size);
4594        if (ret) {
4595                btrfs_free_path(path);
4596                return ret;
4597        }
4598
4599        leaf = path->nodes[0];
4600        extent_item = btrfs_item_ptr(leaf, path->slots[0],
4601                                     struct btrfs_extent_item);
4602        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4603        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4604        btrfs_set_extent_flags(leaf, extent_item,
4605                               flags | BTRFS_EXTENT_FLAG_DATA);
4606
4607        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4608        btrfs_set_extent_inline_ref_type(leaf, iref, type);
4609        if (parent > 0) {
4610                struct btrfs_shared_data_ref *ref;
4611                ref = (struct btrfs_shared_data_ref *)(iref + 1);
4612                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4613                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4614        } else {
4615                struct btrfs_extent_data_ref *ref;
4616                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4617                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4618                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4619                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4620                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4621        }
4622
4623        btrfs_mark_buffer_dirty(path->nodes[0]);
4624        btrfs_free_path(path);
4625
4626        ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
4627        if (ret)
4628                return ret;
4629
4630        ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
4631        if (ret) { /* -ENOENT, logic error */
4632                btrfs_err(fs_info, "update block group failed for %llu %llu",
4633                        ins->objectid, ins->offset);
4634                BUG();
4635        }
4636        trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
4637        return ret;
4638}
4639
4640static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4641                                     struct btrfs_delayed_ref_node *node,
4642                                     struct btrfs_delayed_extent_op *extent_op)
4643{
4644        struct btrfs_fs_info *fs_info = trans->fs_info;
4645        int ret;
4646        struct btrfs_extent_item *extent_item;
4647        struct btrfs_key extent_key;
4648        struct btrfs_tree_block_info *block_info;
4649        struct btrfs_extent_inline_ref *iref;
4650        struct btrfs_path *path;
4651        struct extent_buffer *leaf;
4652        struct btrfs_delayed_tree_ref *ref;
4653        u32 size = sizeof(*extent_item) + sizeof(*iref);
4654        u64 num_bytes;
4655        u64 flags = extent_op->flags_to_set;
4656        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4657
4658        ref = btrfs_delayed_node_to_tree_ref(node);
4659
4660        extent_key.objectid = node->bytenr;
4661        if (skinny_metadata) {
4662                extent_key.offset = ref->level;
4663                extent_key.type = BTRFS_METADATA_ITEM_KEY;
4664                num_bytes = fs_info->nodesize;
4665        } else {
4666                extent_key.offset = node->num_bytes;
4667                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4668                size += sizeof(*block_info);
4669                num_bytes = node->num_bytes;
4670        }
4671
4672        path = btrfs_alloc_path();
4673        if (!path)
4674                return -ENOMEM;
4675
4676        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4677                                      &extent_key, size);
4678        if (ret) {
4679                btrfs_free_path(path);
4680                return ret;
4681        }
4682
4683        leaf = path->nodes[0];
4684        extent_item = btrfs_item_ptr(leaf, path->slots[0],
4685                                     struct btrfs_extent_item);
4686        btrfs_set_extent_refs(leaf, extent_item, 1);
4687        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4688        btrfs_set_extent_flags(leaf, extent_item,
4689                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4690
4691        if (skinny_metadata) {
4692                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4693        } else {
4694                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4695                btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4696                btrfs_set_tree_block_level(leaf, block_info, ref->level);
4697                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4698        }
4699
4700        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4701                btrfs_set_extent_inline_ref_type(leaf, iref,
4702                                                 BTRFS_SHARED_BLOCK_REF_KEY);
4703                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4704        } else {
4705                btrfs_set_extent_inline_ref_type(leaf, iref,
4706                                                 BTRFS_TREE_BLOCK_REF_KEY);
4707                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4708        }
4709
4710        btrfs_mark_buffer_dirty(leaf);
4711        btrfs_free_path(path);
4712
4713        ret = remove_from_free_space_tree(trans, extent_key.objectid,
4714                                          num_bytes);
4715        if (ret)
4716                return ret;
4717
4718        ret = btrfs_update_block_group(trans, extent_key.objectid,
4719                                       fs_info->nodesize, true);
4720        if (ret) { /* -ENOENT, logic error */
4721                btrfs_err(fs_info, "update block group failed for %llu %llu",
4722                        extent_key.objectid, extent_key.offset);
4723                BUG();
4724        }
4725
4726        trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
4727                                          fs_info->nodesize);
4728        return ret;
4729}
4730
4731int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4732                                     struct btrfs_root *root, u64 owner,
4733                                     u64 offset, u64 ram_bytes,
4734                                     struct btrfs_key *ins)
4735{
4736        struct btrfs_ref generic_ref = { 0 };
4737
4738        BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4739
4740        btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4741                               ins->objectid, ins->offset, 0);
4742        btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4743                            offset, 0, false);
4744        btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4745
4746        return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4747}
4748
4749/*
4750 * this is used by the tree logging recovery code.  It records that
4751 * an extent has been allocated and makes sure to clear the free
4752 * space cache bits as well
4753 */
4754int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4755                                   u64 root_objectid, u64 owner, u64 offset,
4756                                   struct btrfs_key *ins)
4757{
4758        struct btrfs_fs_info *fs_info = trans->fs_info;
4759        int ret;
4760        struct btrfs_block_group *block_group;
4761        struct btrfs_space_info *space_info;
4762
4763        /*
4764         * Mixed block groups will exclude before processing the log so we only
4765         * need to do the exclude dance if this fs isn't mixed.
4766         */
4767        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4768                ret = __exclude_logged_extent(fs_info, ins->objectid,
4769                                              ins->offset);
4770                if (ret)
4771                        return ret;
4772        }
4773
4774        block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4775        if (!block_group)
4776                return -EINVAL;
4777
4778        space_info = block_group->space_info;
4779        spin_lock(&space_info->lock);
4780        spin_lock(&block_group->lock);
4781        space_info->bytes_reserved += ins->offset;
4782        block_group->reserved += ins->offset;
4783        spin_unlock(&block_group->lock);
4784        spin_unlock(&space_info->lock);
4785
4786        ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4787                                         offset, ins, 1);
4788        if (ret)
4789                btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4790        btrfs_put_block_group(block_group);
4791        return ret;
4792}
4793
4794static struct extent_buffer *
4795btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4796                      u64 bytenr, int level, u64 owner,
4797                      enum btrfs_lock_nesting nest)
4798{
4799        struct btrfs_fs_info *fs_info = root->fs_info;
4800        struct extent_buffer *buf;
4801
4802        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4803        if (IS_ERR(buf))
4804                return buf;
4805
4806        /*
4807         * Extra safety check in case the extent tree is corrupted and extent
4808         * allocator chooses to use a tree block which is already used and
4809         * locked.
4810         */
4811        if (buf->lock_owner == current->pid) {
4812                btrfs_err_rl(fs_info,
4813"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4814                        buf->start, btrfs_header_owner(buf), current->pid);
4815                free_extent_buffer(buf);
4816                return ERR_PTR(-EUCLEAN);
4817        }
4818
4819        /*
4820         * This needs to stay, because we could allocate a freed block from an
4821         * old tree into a new tree, so we need to make sure this new block is
4822         * set to the appropriate level and owner.
4823         */
4824        btrfs_set_buffer_lockdep_class(owner, buf, level);
4825        __btrfs_tree_lock(buf, nest);
4826        btrfs_clean_tree_block(buf);
4827        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4828        clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4829
4830        set_extent_buffer_uptodate(buf);
4831
4832        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4833        btrfs_set_header_level(buf, level);
4834        btrfs_set_header_bytenr(buf, buf->start);
4835        btrfs_set_header_generation(buf, trans->transid);
4836        btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4837        btrfs_set_header_owner(buf, owner);
4838        write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4839        write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4840        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4841                buf->log_index = root->log_transid % 2;
4842                /*
4843                 * we allow two log transactions at a time, use different
4844                 * EXTENT bit to differentiate dirty pages.
4845                 */
4846                if (buf->log_index == 0)
4847                        set_extent_dirty(&root->dirty_log_pages, buf->start,
4848                                        buf->start + buf->len - 1, GFP_NOFS);
4849                else
4850                        set_extent_new(&root->dirty_log_pages, buf->start,
4851                                        buf->start + buf->len - 1);
4852        } else {
4853                buf->log_index = -1;
4854                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4855                         buf->start + buf->len - 1, GFP_NOFS);
4856        }
4857        /* this returns a buffer locked for blocking */
4858        return buf;
4859}
4860
4861/*
4862 * finds a free extent and does all the dirty work required for allocation
4863 * returns the tree buffer or an ERR_PTR on error.
4864 */
4865struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4866                                             struct btrfs_root *root,
4867                                             u64 parent, u64 root_objectid,
4868                                             const struct btrfs_disk_key *key,
4869                                             int level, u64 hint,
4870                                             u64 empty_size,
4871                                             enum btrfs_lock_nesting nest)
4872{
4873        struct btrfs_fs_info *fs_info = root->fs_info;
4874        struct btrfs_key ins;
4875        struct btrfs_block_rsv *block_rsv;
4876        struct extent_buffer *buf;
4877        struct btrfs_delayed_extent_op *extent_op;
4878        struct btrfs_ref generic_ref = { 0 };
4879        u64 flags = 0;
4880        int ret;
4881        u32 blocksize = fs_info->nodesize;
4882        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4883
4884#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4885        if (btrfs_is_testing(fs_info)) {
4886                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4887                                            level, root_objectid, nest);
4888                if (!IS_ERR(buf))
4889                        root->alloc_bytenr += blocksize;
4890                return buf;
4891        }
4892#endif
4893
4894        block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4895        if (IS_ERR(block_rsv))
4896                return ERR_CAST(block_rsv);
4897
4898        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4899                                   empty_size, hint, &ins, 0, 0);
4900        if (ret)
4901                goto out_unuse;
4902
4903        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4904                                    root_objectid, nest);
4905        if (IS_ERR(buf)) {
4906                ret = PTR_ERR(buf);
4907                goto out_free_reserved;
4908        }
4909
4910        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4911                if (parent == 0)
4912                        parent = ins.objectid;
4913                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4914        } else
4915                BUG_ON(parent > 0);
4916
4917        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4918                extent_op = btrfs_alloc_delayed_extent_op();
4919                if (!extent_op) {
4920                        ret = -ENOMEM;
4921                        goto out_free_buf;
4922                }
4923                if (key)
4924                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
4925                else
4926                        memset(&extent_op->key, 0, sizeof(extent_op->key));
4927                extent_op->flags_to_set = flags;
4928                extent_op->update_key = skinny_metadata ? false : true;
4929                extent_op->update_flags = true;
4930                extent_op->is_data = false;
4931                extent_op->level = level;
4932
4933                btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4934                                       ins.objectid, ins.offset, parent);
4935                btrfs_init_tree_ref(&generic_ref, level, root_objectid,
4936                                    root->root_key.objectid, false);
4937                btrfs_ref_tree_mod(fs_info, &generic_ref);
4938                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4939                if (ret)
4940                        goto out_free_delayed;
4941        }
4942        return buf;
4943
4944out_free_delayed:
4945        btrfs_free_delayed_extent_op(extent_op);
4946out_free_buf:
4947        btrfs_tree_unlock(buf);
4948        free_extent_buffer(buf);
4949out_free_reserved:
4950        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4951out_unuse:
4952        btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4953        return ERR_PTR(ret);
4954}
4955
4956struct walk_control {
4957        u64 refs[BTRFS_MAX_LEVEL];
4958        u64 flags[BTRFS_MAX_LEVEL];
4959        struct btrfs_key update_progress;
4960        struct btrfs_key drop_progress;
4961        int drop_level;
4962        int stage;
4963        int level;
4964        int shared_level;
4965        int update_ref;
4966        int keep_locks;
4967        int reada_slot;
4968        int reada_count;
4969        int restarted;
4970};
4971
4972#define DROP_REFERENCE  1
4973#define UPDATE_BACKREF  2
4974
4975static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4976                                     struct btrfs_root *root,
4977                                     struct walk_control *wc,
4978                                     struct btrfs_path *path)
4979{
4980        struct btrfs_fs_info *fs_info = root->fs_info;
4981        u64 bytenr;
4982        u64 generation;
4983        u64 refs;
4984        u64 flags;
4985        u32 nritems;
4986        struct btrfs_key key;
4987        struct extent_buffer *eb;
4988        int ret;
4989        int slot;
4990        int nread = 0;
4991
4992        if (path->slots[wc->level] < wc->reada_slot) {
4993                wc->reada_count = wc->reada_count * 2 / 3;
4994                wc->reada_count = max(wc->reada_count, 2);
4995        } else {
4996                wc->reada_count = wc->reada_count * 3 / 2;
4997                wc->reada_count = min_t(int, wc->reada_count,
4998                                        BTRFS_NODEPTRS_PER_BLOCK(fs_info));
4999        }
5000
5001        eb = path->nodes[wc->level];
5002        nritems = btrfs_header_nritems(eb);
5003
5004        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5005                if (nread >= wc->reada_count)
5006                        break;
5007
5008                cond_resched();
5009                bytenr = btrfs_node_blockptr(eb, slot);
5010                generation = btrfs_node_ptr_generation(eb, slot);
5011
5012                if (slot == path->slots[wc->level])
5013                        goto reada;
5014
5015                if (wc->stage == UPDATE_BACKREF &&
5016                    generation <= root->root_key.offset)
5017                        continue;
5018
5019                /* We don't lock the tree block, it's OK to be racy here */
5020                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5021                                               wc->level - 1, 1, &refs,
5022                                               &flags);
5023                /* We don't care about errors in readahead. */
5024                if (ret < 0)
5025                        continue;
5026                BUG_ON(refs == 0);
5027
5028                if (wc->stage == DROP_REFERENCE) {
5029                        if (refs == 1)
5030                                goto reada;
5031
5032                        if (wc->level == 1 &&
5033                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5034                                continue;
5035                        if (!wc->update_ref ||
5036                            generation <= root->root_key.offset)
5037                                continue;
5038                        btrfs_node_key_to_cpu(eb, &key, slot);
5039                        ret = btrfs_comp_cpu_keys(&key,
5040                                                  &wc->update_progress);
5041                        if (ret < 0)
5042                                continue;
5043                } else {
5044                        if (wc->level == 1 &&
5045                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5046                                continue;
5047                }
5048reada:
5049                btrfs_readahead_node_child(eb, slot);
5050                nread++;
5051        }
5052        wc->reada_slot = slot;
5053}
5054
5055/*
5056 * helper to process tree block while walking down the tree.
5057 *
5058 * when wc->stage == UPDATE_BACKREF, this function updates
5059 * back refs for pointers in the block.
5060 *
5061 * NOTE: return value 1 means we should stop walking down.
5062 */
5063static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5064                                   struct btrfs_root *root,
5065                                   struct btrfs_path *path,
5066                                   struct walk_control *wc, int lookup_info)
5067{
5068        struct btrfs_fs_info *fs_info = root->fs_info;
5069        int level = wc->level;
5070        struct extent_buffer *eb = path->nodes[level];
5071        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5072        int ret;
5073
5074        if (wc->stage == UPDATE_BACKREF &&
5075            btrfs_header_owner(eb) != root->root_key.objectid)
5076                return 1;
5077
5078        /*
5079         * when reference count of tree block is 1, it won't increase
5080         * again. once full backref flag is set, we never clear it.
5081         */
5082        if (lookup_info &&
5083            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5084             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5085                BUG_ON(!path->locks[level]);
5086                ret = btrfs_lookup_extent_info(trans, fs_info,
5087                                               eb->start, level, 1,
5088                                               &wc->refs[level],
5089                                               &wc->flags[level]);
5090                BUG_ON(ret == -ENOMEM);
5091                if (ret)
5092                        return ret;
5093                BUG_ON(wc->refs[level] == 0);
5094        }
5095
5096        if (wc->stage == DROP_REFERENCE) {
5097                if (wc->refs[level] > 1)
5098                        return 1;
5099
5100                if (path->locks[level] && !wc->keep_locks) {
5101                        btrfs_tree_unlock_rw(eb, path->locks[level]);
5102                        path->locks[level] = 0;
5103                }
5104                return 0;
5105        }
5106
5107        /* wc->stage == UPDATE_BACKREF */
5108        if (!(wc->flags[level] & flag)) {
5109                BUG_ON(!path->locks[level]);
5110                ret = btrfs_inc_ref(trans, root, eb, 1);
5111                BUG_ON(ret); /* -ENOMEM */
5112                ret = btrfs_dec_ref(trans, root, eb, 0);
5113                BUG_ON(ret); /* -ENOMEM */
5114                ret = btrfs_set_disk_extent_flags(trans, eb, flag,
5115                                                  btrfs_header_level(eb), 0);
5116                BUG_ON(ret); /* -ENOMEM */
5117                wc->flags[level] |= flag;
5118        }
5119
5120        /*
5121         * the block is shared by multiple trees, so it's not good to
5122         * keep the tree lock
5123         */
5124        if (path->locks[level] && level > 0) {
5125                btrfs_tree_unlock_rw(eb, path->locks[level]);
5126                path->locks[level] = 0;
5127        }
5128        return 0;
5129}
5130
5131/*
5132 * This is used to verify a ref exists for this root to deal with a bug where we
5133 * would have a drop_progress key that hadn't been updated properly.
5134 */
5135static int check_ref_exists(struct btrfs_trans_handle *trans,
5136                            struct btrfs_root *root, u64 bytenr, u64 parent,
5137                            int level)
5138{
5139        struct btrfs_path *path;
5140        struct btrfs_extent_inline_ref *iref;
5141        int ret;
5142
5143        path = btrfs_alloc_path();
5144        if (!path)
5145                return -ENOMEM;
5146
5147        ret = lookup_extent_backref(trans, path, &iref, bytenr,
5148                                    root->fs_info->nodesize, parent,
5149                                    root->root_key.objectid, level, 0);
5150        btrfs_free_path(path);
5151        if (ret == -ENOENT)
5152                return 0;
5153        if (ret < 0)
5154                return ret;
5155        return 1;
5156}
5157
5158/*
5159 * helper to process tree block pointer.
5160 *
5161 * when wc->stage == DROP_REFERENCE, this function checks
5162 * reference count of the block pointed to. if the block
5163 * is shared and we need update back refs for the subtree
5164 * rooted at the block, this function changes wc->stage to
5165 * UPDATE_BACKREF. if the block is shared and there is no
5166 * need to update back, this function drops the reference
5167 * to the block.
5168 *
5169 * NOTE: return value 1 means we should stop walking down.
5170 */
5171static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5172                                 struct btrfs_root *root,
5173                                 struct btrfs_path *path,
5174                                 struct walk_control *wc, int *lookup_info)
5175{
5176        struct btrfs_fs_info *fs_info = root->fs_info;
5177        u64 bytenr;
5178        u64 generation;
5179        u64 parent;
5180        struct btrfs_key key;
5181        struct btrfs_key first_key;
5182        struct btrfs_ref ref = { 0 };
5183        struct extent_buffer *next;
5184        int level = wc->level;
5185        int reada = 0;
5186        int ret = 0;
5187        bool need_account = false;
5188
5189        generation = btrfs_node_ptr_generation(path->nodes[level],
5190                                               path->slots[level]);
5191        /*
5192         * if the lower level block was created before the snapshot
5193         * was created, we know there is no need to update back refs
5194         * for the subtree
5195         */
5196        if (wc->stage == UPDATE_BACKREF &&
5197            generation <= root->root_key.offset) {
5198                *lookup_info = 1;
5199                return 1;
5200        }
5201
5202        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5203        btrfs_node_key_to_cpu(path->nodes[level], &first_key,
5204                              path->slots[level]);
5205
5206        next = find_extent_buffer(fs_info, bytenr);
5207        if (!next) {
5208                next = btrfs_find_create_tree_block(fs_info, bytenr,
5209                                root->root_key.objectid, level - 1);
5210                if (IS_ERR(next))
5211                        return PTR_ERR(next);
5212                reada = 1;
5213        }
5214        btrfs_tree_lock(next);
5215
5216        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5217                                       &wc->refs[level - 1],
5218                                       &wc->flags[level - 1]);
5219        if (ret < 0)
5220                goto out_unlock;
5221
5222        if (unlikely(wc->refs[level - 1] == 0)) {
5223                btrfs_err(fs_info, "Missing references.");
5224                ret = -EIO;
5225                goto out_unlock;
5226        }
5227        *lookup_info = 0;
5228
5229        if (wc->stage == DROP_REFERENCE) {
5230                if (wc->refs[level - 1] > 1) {
5231                        need_account = true;
5232                        if (level == 1 &&
5233                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5234                                goto skip;
5235
5236                        if (!wc->update_ref ||
5237                            generation <= root->root_key.offset)
5238                                goto skip;
5239
5240                        btrfs_node_key_to_cpu(path->nodes[level], &key,
5241                                              path->slots[level]);
5242                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5243                        if (ret < 0)
5244                                goto skip;
5245
5246                        wc->stage = UPDATE_BACKREF;
5247                        wc->shared_level = level - 1;
5248                }
5249        } else {
5250                if (level == 1 &&
5251                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5252                        goto skip;
5253        }
5254
5255        if (!btrfs_buffer_uptodate(next, generation, 0)) {
5256                btrfs_tree_unlock(next);
5257                free_extent_buffer(next);
5258                next = NULL;
5259                *lookup_info = 1;
5260        }
5261
5262        if (!next) {
5263                if (reada && level == 1)
5264                        reada_walk_down(trans, root, wc, path);
5265                next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
5266                                       generation, level - 1, &first_key);
5267                if (IS_ERR(next)) {
5268                        return PTR_ERR(next);
5269                } else if (!extent_buffer_uptodate(next)) {
5270                        free_extent_buffer(next);
5271                        return -EIO;
5272                }
5273                btrfs_tree_lock(next);
5274        }
5275
5276        level--;
5277        ASSERT(level == btrfs_header_level(next));
5278        if (level != btrfs_header_level(next)) {
5279                btrfs_err(root->fs_info, "mismatched level");
5280                ret = -EIO;
5281                goto out_unlock;
5282        }
5283        path->nodes[level] = next;
5284        path->slots[level] = 0;
5285        path->locks[level] = BTRFS_WRITE_LOCK;
5286        wc->level = level;
5287        if (wc->level == 1)
5288                wc->reada_slot = 0;
5289        return 0;
5290skip:
5291        wc->refs[level - 1] = 0;
5292        wc->flags[level - 1] = 0;
5293        if (wc->stage == DROP_REFERENCE) {
5294                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5295                        parent = path->nodes[level]->start;
5296                } else {
5297                        ASSERT(root->root_key.objectid ==
5298                               btrfs_header_owner(path->nodes[level]));
5299                        if (root->root_key.objectid !=
5300                            btrfs_header_owner(path->nodes[level])) {
5301                                btrfs_err(root->fs_info,
5302                                                "mismatched block owner");
5303                                ret = -EIO;
5304                                goto out_unlock;
5305                        }
5306                        parent = 0;
5307                }
5308
5309                /*
5310                 * If we had a drop_progress we need to verify the refs are set
5311                 * as expected.  If we find our ref then we know that from here
5312                 * on out everything should be correct, and we can clear the
5313                 * ->restarted flag.
5314                 */
5315                if (wc->restarted) {
5316                        ret = check_ref_exists(trans, root, bytenr, parent,
5317                                               level - 1);
5318                        if (ret < 0)
5319                                goto out_unlock;
5320                        if (ret == 0)
5321                                goto no_delete;
5322                        ret = 0;
5323                        wc->restarted = 0;
5324                }
5325
5326                /*
5327                 * Reloc tree doesn't contribute to qgroup numbers, and we have
5328                 * already accounted them at merge time (replace_path),
5329                 * thus we could skip expensive subtree trace here.
5330                 */
5331                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5332                    need_account) {
5333                        ret = btrfs_qgroup_trace_subtree(trans, next,
5334                                                         generation, level - 1);
5335                        if (ret) {
5336                                btrfs_err_rl(fs_info,
5337                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5338                                             ret);
5339                        }
5340                }
5341
5342                /*
5343                 * We need to update the next key in our walk control so we can
5344                 * update the drop_progress key accordingly.  We don't care if
5345                 * find_next_key doesn't find a key because that means we're at
5346                 * the end and are going to clean up now.
5347                 */
5348                wc->drop_level = level;
5349                find_next_key(path, level, &wc->drop_progress);
5350
5351                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5352                                       fs_info->nodesize, parent);
5353                btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5354                                    0, false);
5355                ret = btrfs_free_extent(trans, &ref);
5356                if (ret)
5357                        goto out_unlock;
5358        }
5359no_delete:
5360        *lookup_info = 1;
5361        ret = 1;
5362
5363out_unlock:
5364        btrfs_tree_unlock(next);
5365        free_extent_buffer(next);
5366
5367        return ret;
5368}
5369
5370/*
5371 * helper to process tree block while walking up the tree.
5372 *
5373 * when wc->stage == DROP_REFERENCE, this function drops
5374 * reference count on the block.
5375 *
5376 * when wc->stage == UPDATE_BACKREF, this function changes
5377 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5378 * to UPDATE_BACKREF previously while processing the block.
5379 *
5380 * NOTE: return value 1 means we should stop walking up.
5381 */
5382static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383                                 struct btrfs_root *root,
5384                                 struct btrfs_path *path,
5385                                 struct walk_control *wc)
5386{
5387        struct btrfs_fs_info *fs_info = root->fs_info;
5388        int ret;
5389        int level = wc->level;
5390        struct extent_buffer *eb = path->nodes[level];
5391        u64 parent = 0;
5392
5393        if (wc->stage == UPDATE_BACKREF) {
5394                BUG_ON(wc->shared_level < level);
5395                if (level < wc->shared_level)
5396                        goto out;
5397
5398                ret = find_next_key(path, level + 1, &wc->update_progress);
5399                if (ret > 0)
5400                        wc->update_ref = 0;
5401
5402                wc->stage = DROP_REFERENCE;
5403                wc->shared_level = -1;
5404                path->slots[level] = 0;
5405
5406                /*
5407                 * check reference count again if the block isn't locked.
5408                 * we should start walking down the tree again if reference
5409                 * count is one.
5410                 */
5411                if (!path->locks[level]) {
5412                        BUG_ON(level == 0);
5413                        btrfs_tree_lock(eb);
5414                        path->locks[level] = BTRFS_WRITE_LOCK;
5415
5416                        ret = btrfs_lookup_extent_info(trans, fs_info,
5417                                                       eb->start, level, 1,
5418                                                       &wc->refs[level],
5419                                                       &wc->flags[level]);
5420                        if (ret < 0) {
5421                                btrfs_tree_unlock_rw(eb, path->locks[level]);
5422                                path->locks[level] = 0;
5423                                return ret;
5424                        }
5425                        BUG_ON(wc->refs[level] == 0);
5426                        if (wc->refs[level] == 1) {
5427                                btrfs_tree_unlock_rw(eb, path->locks[level]);
5428                                path->locks[level] = 0;
5429                                return 1;
5430                        }
5431                }
5432        }
5433
5434        /* wc->stage == DROP_REFERENCE */
5435        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5436
5437        if (wc->refs[level] == 1) {
5438                if (level == 0) {
5439                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5440                                ret = btrfs_dec_ref(trans, root, eb, 1);
5441                        else
5442                                ret = btrfs_dec_ref(trans, root, eb, 0);
5443                        BUG_ON(ret); /* -ENOMEM */
5444                        if (is_fstree(root->root_key.objectid)) {
5445                                ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5446                                if (ret) {
5447                                        btrfs_err_rl(fs_info,
5448        "error %d accounting leaf items, quota is out of sync, rescan required",
5449                                             ret);
5450                                }
5451                        }
5452                }
5453                /* make block locked assertion in btrfs_clean_tree_block happy */
5454                if (!path->locks[level] &&
5455                    btrfs_header_generation(eb) == trans->transid) {
5456                        btrfs_tree_lock(eb);
5457                        path->locks[level] = BTRFS_WRITE_LOCK;
5458                }
5459                btrfs_clean_tree_block(eb);
5460        }
5461
5462        if (eb == root->node) {
5463                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5464                        parent = eb->start;
5465                else if (root->root_key.objectid != btrfs_header_owner(eb))
5466                        goto owner_mismatch;
5467        } else {
5468                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5469                        parent = path->nodes[level + 1]->start;
5470                else if (root->root_key.objectid !=
5471                         btrfs_header_owner(path->nodes[level + 1]))
5472                        goto owner_mismatch;
5473        }
5474
5475        btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5476                              wc->refs[level] == 1);
5477out:
5478        wc->refs[level] = 0;
5479        wc->flags[level] = 0;
5480        return 0;
5481
5482owner_mismatch:
5483        btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5484                     btrfs_header_owner(eb), root->root_key.objectid);
5485        return -EUCLEAN;
5486}
5487
5488static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5489                                   struct btrfs_root *root,
5490                                   struct btrfs_path *path,
5491                                   struct walk_control *wc)
5492{
5493        int level = wc->level;
5494        int lookup_info = 1;
5495        int ret;
5496
5497        while (level >= 0) {
5498                ret = walk_down_proc(trans, root, path, wc, lookup_info);
5499                if (ret > 0)
5500                        break;
5501
5502                if (level == 0)
5503                        break;
5504
5505                if (path->slots[level] >=
5506                    btrfs_header_nritems(path->nodes[level]))
5507                        break;
5508
5509                ret = do_walk_down(trans, root, path, wc, &lookup_info);
5510                if (ret > 0) {
5511                        path->slots[level]++;
5512                        continue;
5513                } else if (ret < 0)
5514                        return ret;
5515                level = wc->level;
5516        }
5517        return 0;
5518}
5519
5520static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5521                                 struct btrfs_root *root,
5522                                 struct btrfs_path *path,
5523                                 struct walk_control *wc, int max_level)
5524{
5525        int level = wc->level;
5526        int ret;
5527
5528        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5529        while (level < max_level && path->nodes[level]) {
5530                wc->level = level;
5531                if (path->slots[level] + 1 <
5532                    btrfs_header_nritems(path->nodes[level])) {
5533                        path->slots[level]++;
5534                        return 0;
5535                } else {
5536                        ret = walk_up_proc(trans, root, path, wc);
5537                        if (ret > 0)
5538                                return 0;
5539                        if (ret < 0)
5540                                return ret;
5541
5542                        if (path->locks[level]) {
5543                                btrfs_tree_unlock_rw(path->nodes[level],
5544                                                     path->locks[level]);
5545                                path->locks[level] = 0;
5546                        }
5547                        free_extent_buffer(path->nodes[level]);
5548                        path->nodes[level] = NULL;
5549                        level++;
5550                }
5551        }
5552        return 1;
5553}
5554
5555/*
5556 * drop a subvolume tree.
5557 *
5558 * this function traverses the tree freeing any blocks that only
5559 * referenced by the tree.
5560 *
5561 * when a shared tree block is found. this function decreases its
5562 * reference count by one. if update_ref is true, this function
5563 * also make sure backrefs for the shared block and all lower level
5564 * blocks are properly updated.
5565 *
5566 * If called with for_reloc == 0, may exit early with -EAGAIN
5567 */
5568int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5569{
5570        struct btrfs_fs_info *fs_info = root->fs_info;
5571        struct btrfs_path *path;
5572        struct btrfs_trans_handle *trans;
5573        struct btrfs_root *tree_root = fs_info->tree_root;
5574        struct btrfs_root_item *root_item = &root->root_item;
5575        struct walk_control *wc;
5576        struct btrfs_key key;
5577        int err = 0;
5578        int ret;
5579        int level;
5580        bool root_dropped = false;
5581
5582        btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5583
5584        path = btrfs_alloc_path();
5585        if (!path) {
5586                err = -ENOMEM;
5587                goto out;
5588        }
5589
5590        wc = kzalloc(sizeof(*wc), GFP_NOFS);
5591        if (!wc) {
5592                btrfs_free_path(path);
5593                err = -ENOMEM;
5594                goto out;
5595        }
5596
5597        /*
5598         * Use join to avoid potential EINTR from transaction start. See
5599         * wait_reserve_ticket and the whole reservation callchain.
5600         */
5601        if (for_reloc)
5602                trans = btrfs_join_transaction(tree_root);
5603        else
5604                trans = btrfs_start_transaction(tree_root, 0);
5605        if (IS_ERR(trans)) {
5606                err = PTR_ERR(trans);
5607                goto out_free;
5608        }
5609
5610        err = btrfs_run_delayed_items(trans);
5611        if (err)
5612                goto out_end_trans;
5613
5614        /*
5615         * This will help us catch people modifying the fs tree while we're
5616         * dropping it.  It is unsafe to mess with the fs tree while it's being
5617         * dropped as we unlock the root node and parent nodes as we walk down
5618         * the tree, assuming nothing will change.  If something does change
5619         * then we'll have stale information and drop references to blocks we've
5620         * already dropped.
5621         */
5622        set_bit(BTRFS_ROOT_DELETING, &root->state);
5623        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5624                level = btrfs_header_level(root->node);
5625                path->nodes[level] = btrfs_lock_root_node(root);
5626                path->slots[level] = 0;
5627                path->locks[level] = BTRFS_WRITE_LOCK;
5628                memset(&wc->update_progress, 0,
5629                       sizeof(wc->update_progress));
5630        } else {
5631                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5632                memcpy(&wc->update_progress, &key,
5633                       sizeof(wc->update_progress));
5634
5635                level = btrfs_root_drop_level(root_item);
5636                BUG_ON(level == 0);
5637                path->lowest_level = level;
5638                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5639                path->lowest_level = 0;
5640                if (ret < 0) {
5641                        err = ret;
5642                        goto out_end_trans;
5643                }
5644                WARN_ON(ret > 0);
5645
5646                /*
5647                 * unlock our path, this is safe because only this
5648                 * function is allowed to delete this snapshot
5649                 */
5650                btrfs_unlock_up_safe(path, 0);
5651
5652                level = btrfs_header_level(root->node);
5653                while (1) {
5654                        btrfs_tree_lock(path->nodes[level]);
5655                        path->locks[level] = BTRFS_WRITE_LOCK;
5656
5657                        ret = btrfs_lookup_extent_info(trans, fs_info,
5658                                                path->nodes[level]->start,
5659                                                level, 1, &wc->refs[level],
5660                                                &wc->flags[level]);
5661                        if (ret < 0) {
5662                                err = ret;
5663                                goto out_end_trans;
5664                        }
5665                        BUG_ON(wc->refs[level] == 0);
5666
5667                        if (level == btrfs_root_drop_level(root_item))
5668                                break;
5669
5670                        btrfs_tree_unlock(path->nodes[level]);
5671                        path->locks[level] = 0;
5672                        WARN_ON(wc->refs[level] != 1);
5673                        level--;
5674                }
5675        }
5676
5677        wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5678        wc->level = level;
5679        wc->shared_level = -1;
5680        wc->stage = DROP_REFERENCE;
5681        wc->update_ref = update_ref;
5682        wc->keep_locks = 0;
5683        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5684
5685        while (1) {
5686
5687                ret = walk_down_tree(trans, root, path, wc);
5688                if (ret < 0) {
5689                        err = ret;
5690                        break;
5691                }
5692
5693                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5694                if (ret < 0) {
5695                        err = ret;
5696                        break;
5697                }
5698
5699                if (ret > 0) {
5700                        BUG_ON(wc->stage != DROP_REFERENCE);
5701                        break;
5702                }
5703
5704                if (wc->stage == DROP_REFERENCE) {
5705                        wc->drop_level = wc->level;
5706                        btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5707                                              &wc->drop_progress,
5708                                              path->slots[wc->drop_level]);
5709                }
5710                btrfs_cpu_key_to_disk(&root_item->drop_progress,
5711                                      &wc->drop_progress);
5712                btrfs_set_root_drop_level(root_item, wc->drop_level);
5713
5714                BUG_ON(wc->level == 0);
5715                if (btrfs_should_end_transaction(trans) ||
5716                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5717                        ret = btrfs_update_root(trans, tree_root,
5718                                                &root->root_key,
5719                                                root_item);
5720                        if (ret) {
5721                                btrfs_abort_transaction(trans, ret);
5722                                err = ret;
5723                                goto out_end_trans;
5724                        }
5725
5726                        btrfs_end_transaction_throttle(trans);
5727                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5728                                btrfs_debug(fs_info,
5729                                            "drop snapshot early exit");
5730                                err = -EAGAIN;
5731                                goto out_free;
5732                        }
5733
5734                       /*
5735                        * Use join to avoid potential EINTR from transaction
5736                        * start. See wait_reserve_ticket and the whole
5737                        * reservation callchain.
5738                        */
5739                        if (for_reloc)
5740                                trans = btrfs_join_transaction(tree_root);
5741                        else
5742                                trans = btrfs_start_transaction(tree_root, 0);
5743                        if (IS_ERR(trans)) {
5744                                err = PTR_ERR(trans);
5745                                goto out_free;
5746                        }
5747                }
5748        }
5749        btrfs_release_path(path);
5750        if (err)
5751                goto out_end_trans;
5752
5753        ret = btrfs_del_root(trans, &root->root_key);
5754        if (ret) {
5755                btrfs_abort_transaction(trans, ret);
5756                err = ret;
5757                goto out_end_trans;
5758        }
5759
5760        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5761                ret = btrfs_find_root(tree_root, &root->root_key, path,
5762                                      NULL, NULL);
5763                if (ret < 0) {
5764                        btrfs_abort_transaction(trans, ret);
5765                        err = ret;
5766                        goto out_end_trans;
5767                } else if (ret > 0) {
5768                        /* if we fail to delete the orphan item this time
5769                         * around, it'll get picked up the next time.
5770                         *
5771                         * The most common failure here is just -ENOENT.
5772                         */
5773                        btrfs_del_orphan_item(trans, tree_root,
5774                                              root->root_key.objectid);
5775                }
5776        }
5777
5778        /*
5779         * This subvolume is going to be completely dropped, and won't be
5780         * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5781         * commit transaction time.  So free it here manually.
5782         */
5783        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5784        btrfs_qgroup_free_meta_all_pertrans(root);
5785
5786        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5787                btrfs_add_dropped_root(trans, root);
5788        else
5789                btrfs_put_root(root);
5790        root_dropped = true;
5791out_end_trans:
5792        btrfs_end_transaction_throttle(trans);
5793out_free:
5794        kfree(wc);
5795        btrfs_free_path(path);
5796out:
5797        /*
5798         * So if we need to stop dropping the snapshot for whatever reason we
5799         * need to make sure to add it back to the dead root list so that we
5800         * keep trying to do the work later.  This also cleans up roots if we
5801         * don't have it in the radix (like when we recover after a power fail
5802         * or unmount) so we don't leak memory.
5803         */
5804        if (!for_reloc && !root_dropped)
5805                btrfs_add_dead_root(root);
5806        return err;
5807}
5808
5809/*
5810 * drop subtree rooted at tree block 'node'.
5811 *
5812 * NOTE: this function will unlock and release tree block 'node'
5813 * only used by relocation code
5814 */
5815int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5816                        struct btrfs_root *root,
5817                        struct extent_buffer *node,
5818                        struct extent_buffer *parent)
5819{
5820        struct btrfs_fs_info *fs_info = root->fs_info;
5821        struct btrfs_path *path;
5822        struct walk_control *wc;
5823        int level;
5824        int parent_level;
5825        int ret = 0;
5826        int wret;
5827
5828        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5829
5830        path = btrfs_alloc_path();
5831        if (!path)
5832                return -ENOMEM;
5833
5834        wc = kzalloc(sizeof(*wc), GFP_NOFS);
5835        if (!wc) {
5836                btrfs_free_path(path);
5837                return -ENOMEM;
5838        }
5839
5840        btrfs_assert_tree_write_locked(parent);
5841        parent_level = btrfs_header_level(parent);
5842        atomic_inc(&parent->refs);
5843        path->nodes[parent_level] = parent;
5844        path->slots[parent_level] = btrfs_header_nritems(parent);
5845
5846        btrfs_assert_tree_write_locked(node);
5847        level = btrfs_header_level(node);
5848        path->nodes[level] = node;
5849        path->slots[level] = 0;
5850        path->locks[level] = BTRFS_WRITE_LOCK;
5851
5852        wc->refs[parent_level] = 1;
5853        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5854        wc->level = level;
5855        wc->shared_level = -1;
5856        wc->stage = DROP_REFERENCE;
5857        wc->update_ref = 0;
5858        wc->keep_locks = 1;
5859        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5860
5861        while (1) {
5862                wret = walk_down_tree(trans, root, path, wc);
5863                if (wret < 0) {
5864                        ret = wret;
5865                        break;
5866                }
5867
5868                wret = walk_up_tree(trans, root, path, wc, parent_level);
5869                if (wret < 0)
5870                        ret = wret;
5871                if (wret != 0)
5872                        break;
5873        }
5874
5875        kfree(wc);
5876        btrfs_free_path(path);
5877        return ret;
5878}
5879
5880/*
5881 * helper to account the unused space of all the readonly block group in the
5882 * space_info. takes mirrors into account.
5883 */
5884u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
5885{
5886        struct btrfs_block_group *block_group;
5887        u64 free_bytes = 0;
5888        int factor;
5889
5890        /* It's df, we don't care if it's racy */
5891        if (list_empty(&sinfo->ro_bgs))
5892                return 0;
5893
5894        spin_lock(&sinfo->lock);
5895        list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
5896                spin_lock(&block_group->lock);
5897
5898                if (!block_group->ro) {
5899                        spin_unlock(&block_group->lock);
5900                        continue;
5901                }
5902
5903                factor = btrfs_bg_type_to_factor(block_group->flags);
5904                free_bytes += (block_group->length -
5905                               block_group->used) * factor;
5906
5907                spin_unlock(&block_group->lock);
5908        }
5909        spin_unlock(&sinfo->lock);
5910
5911        return free_bytes;
5912}
5913
5914int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5915                                   u64 start, u64 end)
5916{
5917        return unpin_extent_range(fs_info, start, end, false);
5918}
5919
5920/*
5921 * It used to be that old block groups would be left around forever.
5922 * Iterating over them would be enough to trim unused space.  Since we
5923 * now automatically remove them, we also need to iterate over unallocated
5924 * space.
5925 *
5926 * We don't want a transaction for this since the discard may take a
5927 * substantial amount of time.  We don't require that a transaction be
5928 * running, but we do need to take a running transaction into account
5929 * to ensure that we're not discarding chunks that were released or
5930 * allocated in the current transaction.
5931 *
5932 * Holding the chunks lock will prevent other threads from allocating
5933 * or releasing chunks, but it won't prevent a running transaction
5934 * from committing and releasing the memory that the pending chunks
5935 * list head uses.  For that, we need to take a reference to the
5936 * transaction and hold the commit root sem.  We only need to hold
5937 * it while performing the free space search since we have already
5938 * held back allocations.
5939 */
5940static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5941{
5942        u64 start = SZ_1M, len = 0, end = 0;
5943        int ret;
5944
5945        *trimmed = 0;
5946
5947        /* Discard not supported = nothing to do. */
5948        if (!blk_queue_discard(bdev_get_queue(device->bdev)))
5949                return 0;
5950
5951        /* Not writable = nothing to do. */
5952        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5953                return 0;
5954
5955        /* No free space = nothing to do. */
5956        if (device->total_bytes <= device->bytes_used)
5957                return 0;
5958
5959        ret = 0;
5960
5961        while (1) {
5962                struct btrfs_fs_info *fs_info = device->fs_info;
5963                u64 bytes;
5964
5965                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5966                if (ret)
5967                        break;
5968
5969                find_first_clear_extent_bit(&device->alloc_state, start,
5970                                            &start, &end,
5971                                            CHUNK_TRIMMED | CHUNK_ALLOCATED);
5972
5973                /* Check if there are any CHUNK_* bits left */
5974                if (start > device->total_bytes) {
5975                        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5976                        btrfs_warn_in_rcu(fs_info,
5977"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5978                                          start, end - start + 1,
5979                                          rcu_str_deref(device->name),
5980                                          device->total_bytes);
5981                        mutex_unlock(&fs_info->chunk_mutex);
5982                        ret = 0;
5983                        break;
5984                }
5985
5986                /* Ensure we skip the reserved area in the first 1M */
5987                start = max_t(u64, start, SZ_1M);
5988
5989                /*
5990                 * If find_first_clear_extent_bit find a range that spans the
5991                 * end of the device it will set end to -1, in this case it's up
5992                 * to the caller to trim the value to the size of the device.
5993                 */
5994                end = min(end, device->total_bytes - 1);
5995
5996                len = end - start + 1;
5997
5998                /* We didn't find any extents */
5999                if (!len) {
6000                        mutex_unlock(&fs_info->chunk_mutex);
6001                        ret = 0;
6002                        break;
6003                }
6004
6005                ret = btrfs_issue_discard(device->bdev, start, len,
6006                                          &bytes);
6007                if (!ret)
6008                        set_extent_bits(&device->alloc_state, start,
6009                                        start + bytes - 1,
6010                                        CHUNK_TRIMMED);
6011                mutex_unlock(&fs_info->chunk_mutex);
6012
6013                if (ret)
6014                        break;
6015
6016                start += len;
6017                *trimmed += bytes;
6018
6019                if (fatal_signal_pending(current)) {
6020                        ret = -ERESTARTSYS;
6021                        break;
6022                }
6023
6024                cond_resched();
6025        }
6026
6027        return ret;
6028}
6029
6030/*
6031 * Trim the whole filesystem by:
6032 * 1) trimming the free space in each block group
6033 * 2) trimming the unallocated space on each device
6034 *
6035 * This will also continue trimming even if a block group or device encounters
6036 * an error.  The return value will be the last error, or 0 if nothing bad
6037 * happens.
6038 */
6039int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6040{
6041        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6042        struct btrfs_block_group *cache = NULL;
6043        struct btrfs_device *device;
6044        u64 group_trimmed;
6045        u64 range_end = U64_MAX;
6046        u64 start;
6047        u64 end;
6048        u64 trimmed = 0;
6049        u64 bg_failed = 0;
6050        u64 dev_failed = 0;
6051        int bg_ret = 0;
6052        int dev_ret = 0;
6053        int ret = 0;
6054
6055        if (range->start == U64_MAX)
6056                return -EINVAL;
6057
6058        /*
6059         * Check range overflow if range->len is set.
6060         * The default range->len is U64_MAX.
6061         */
6062        if (range->len != U64_MAX &&
6063            check_add_overflow(range->start, range->len, &range_end))
6064                return -EINVAL;
6065
6066        cache = btrfs_lookup_first_block_group(fs_info, range->start);
6067        for (; cache; cache = btrfs_next_block_group(cache)) {
6068                if (cache->start >= range_end) {
6069                        btrfs_put_block_group(cache);
6070                        break;
6071                }
6072
6073                start = max(range->start, cache->start);
6074                end = min(range_end, cache->start + cache->length);
6075
6076                if (end - start >= range->minlen) {
6077                        if (!btrfs_block_group_done(cache)) {
6078                                ret = btrfs_cache_block_group(cache, 0);
6079                                if (ret) {
6080                                        bg_failed++;
6081                                        bg_ret = ret;
6082                                        continue;
6083                                }
6084                                ret = btrfs_wait_block_group_cache_done(cache);
6085                                if (ret) {
6086                                        bg_failed++;
6087                                        bg_ret = ret;
6088                                        continue;
6089                                }
6090                        }
6091                        ret = btrfs_trim_block_group(cache,
6092                                                     &group_trimmed,
6093                                                     start,
6094                                                     end,
6095                                                     range->minlen);
6096
6097                        trimmed += group_trimmed;
6098                        if (ret) {
6099                                bg_failed++;
6100                                bg_ret = ret;
6101                                continue;
6102                        }
6103                }
6104        }
6105
6106        if (bg_failed)
6107                btrfs_warn(fs_info,
6108                        "failed to trim %llu block group(s), last error %d",
6109                        bg_failed, bg_ret);
6110
6111        mutex_lock(&fs_devices->device_list_mutex);
6112        list_for_each_entry(device, &fs_devices->devices, dev_list) {
6113                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6114                        continue;
6115
6116                ret = btrfs_trim_free_extents(device, &group_trimmed);
6117                if (ret) {
6118                        dev_failed++;
6119                        dev_ret = ret;
6120                        break;
6121                }
6122
6123                trimmed += group_trimmed;
6124        }
6125        mutex_unlock(&fs_devices->device_list_mutex);
6126
6127        if (dev_failed)
6128                btrfs_warn(fs_info,
6129                        "failed to trim %llu device(s), last error %d",
6130                        dev_failed, dev_ret);
6131        range->len = trimmed;
6132        if (bg_ret)
6133                return bg_ret;
6134        return dev_ret;
6135}
6136