linux/fs/btrfs/extent-tree.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/sched/signal.h>
   8#include <linux/pagemap.h>
   9#include <linux/writeback.h>
  10#include <linux/blkdev.h>
  11#include <linux/sort.h>
  12#include <linux/rcupdate.h>
  13#include <linux/kthread.h>
  14#include <linux/slab.h>
  15#include <linux/ratelimit.h>
  16#include <linux/percpu_counter.h>
  17#include <linux/lockdep.h>
  18#include <linux/crc32c.h>
  19#include "misc.h"
  20#include "tree-log.h"
  21#include "disk-io.h"
  22#include "print-tree.h"
  23#include "volumes.h"
  24#include "raid56.h"
  25#include "locking.h"
  26#include "free-space-cache.h"
  27#include "free-space-tree.h"
  28#include "sysfs.h"
  29#include "qgroup.h"
  30#include "ref-verify.h"
  31#include "space-info.h"
  32#include "block-rsv.h"
  33#include "delalloc-space.h"
  34#include "block-group.h"
  35#include "discard.h"
  36#include "rcu-string.h"
  37#include "zoned.h"
  38#include "dev-replace.h"
  39
  40#undef SCRAMBLE_DELAYED_REFS
  41
  42
  43static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  44                               struct btrfs_delayed_ref_node *node, u64 parent,
  45                               u64 root_objectid, u64 owner_objectid,
  46                               u64 owner_offset, int refs_to_drop,
  47                               struct btrfs_delayed_extent_op *extra_op);
  48static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  49                                    struct extent_buffer *leaf,
  50                                    struct btrfs_extent_item *ei);
  51static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  52                                      u64 parent, u64 root_objectid,
  53                                      u64 flags, u64 owner, u64 offset,
  54                                      struct btrfs_key *ins, int ref_mod);
  55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  56                                     struct btrfs_delayed_ref_node *node,
  57                                     struct btrfs_delayed_extent_op *extent_op);
  58static int find_next_key(struct btrfs_path *path, int level,
  59                         struct btrfs_key *key);
  60
  61static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
  62{
  63        return (cache->flags & bits) == bits;
  64}
  65
  66int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
  67                              u64 start, u64 num_bytes)
  68{
  69        u64 end = start + num_bytes - 1;
  70        set_extent_bits(&fs_info->excluded_extents, start, end,
  71                        EXTENT_UPTODATE);
  72        return 0;
  73}
  74
  75void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
  76{
  77        struct btrfs_fs_info *fs_info = cache->fs_info;
  78        u64 start, end;
  79
  80        start = cache->start;
  81        end = start + cache->length - 1;
  82
  83        clear_extent_bits(&fs_info->excluded_extents, start, end,
  84                          EXTENT_UPTODATE);
  85}
  86
  87/* simple helper to search for an existing data extent at a given offset */
  88int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
  89{
  90        struct btrfs_root *root = btrfs_extent_root(fs_info, start);
  91        int ret;
  92        struct btrfs_key key;
  93        struct btrfs_path *path;
  94
  95        path = btrfs_alloc_path();
  96        if (!path)
  97                return -ENOMEM;
  98
  99        key.objectid = start;
 100        key.offset = len;
 101        key.type = BTRFS_EXTENT_ITEM_KEY;
 102        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 103        btrfs_free_path(path);
 104        return ret;
 105}
 106
 107/*
  108 * helper function to look up the reference count and flags of a tree block.
  109 *
  110 * the head node for a delayed ref is used to store the sum of all the
  111 * reference count modifications queued up in the rbtree. the head
  112 * node may also store the extent flags to set. This way you can check
  113 * what the reference count and extent flags would be as if all of the
  114 * delayed refs had already been run.
 115 */
 116int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 117                             struct btrfs_fs_info *fs_info, u64 bytenr,
 118                             u64 offset, int metadata, u64 *refs, u64 *flags)
 119{
 120        struct btrfs_root *extent_root;
 121        struct btrfs_delayed_ref_head *head;
 122        struct btrfs_delayed_ref_root *delayed_refs;
 123        struct btrfs_path *path;
 124        struct btrfs_extent_item *ei;
 125        struct extent_buffer *leaf;
 126        struct btrfs_key key;
 127        u32 item_size;
 128        u64 num_refs;
 129        u64 extent_flags;
 130        int ret;
 131
 132        /*
 133         * If we don't have skinny metadata, don't bother doing anything
 134         * different
 135         */
 136        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 137                offset = fs_info->nodesize;
 138                metadata = 0;
 139        }
 140
 141        path = btrfs_alloc_path();
 142        if (!path)
 143                return -ENOMEM;
 144
 145        if (!trans) {
 146                path->skip_locking = 1;
 147                path->search_commit_root = 1;
 148        }
 149
 150search_again:
 151        key.objectid = bytenr;
 152        key.offset = offset;
 153        if (metadata)
 154                key.type = BTRFS_METADATA_ITEM_KEY;
 155        else
 156                key.type = BTRFS_EXTENT_ITEM_KEY;
 157
 158        extent_root = btrfs_extent_root(fs_info, bytenr);
 159        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 160        if (ret < 0)
 161                goto out_free;
 162
 163        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 164                if (path->slots[0]) {
 165                        path->slots[0]--;
 166                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 167                                              path->slots[0]);
 168                        if (key.objectid == bytenr &&
 169                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 170                            key.offset == fs_info->nodesize)
 171                                ret = 0;
 172                }
 173        }
 174
 175        if (ret == 0) {
 176                leaf = path->nodes[0];
 177                item_size = btrfs_item_size(leaf, path->slots[0]);
 178                if (item_size >= sizeof(*ei)) {
 179                        ei = btrfs_item_ptr(leaf, path->slots[0],
 180                                            struct btrfs_extent_item);
 181                        num_refs = btrfs_extent_refs(leaf, ei);
 182                        extent_flags = btrfs_extent_flags(leaf, ei);
 183                } else {
 184                        ret = -EINVAL;
 185                        btrfs_print_v0_err(fs_info);
 186                        if (trans)
 187                                btrfs_abort_transaction(trans, ret);
 188                        else
 189                                btrfs_handle_fs_error(fs_info, ret, NULL);
 190
 191                        goto out_free;
 192                }
 193
 194                BUG_ON(num_refs == 0);
 195        } else {
 196                num_refs = 0;
 197                extent_flags = 0;
 198                ret = 0;
 199        }
 200
 201        if (!trans)
 202                goto out;
 203
 204        delayed_refs = &trans->transaction->delayed_refs;
 205        spin_lock(&delayed_refs->lock);
 206        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 207        if (head) {
 208                if (!mutex_trylock(&head->mutex)) {
 209                        refcount_inc(&head->refs);
 210                        spin_unlock(&delayed_refs->lock);
 211
 212                        btrfs_release_path(path);
 213
 214                        /*
 215                         * Mutex was contended, block until it's released and try
 216                         * again
 217                         */
 218                        mutex_lock(&head->mutex);
 219                        mutex_unlock(&head->mutex);
 220                        btrfs_put_delayed_ref_head(head);
 221                        goto search_again;
 222                }
 223                spin_lock(&head->lock);
 224                if (head->extent_op && head->extent_op->update_flags)
 225                        extent_flags |= head->extent_op->flags_to_set;
 226                else
 227                        BUG_ON(num_refs == 0);
 228
 229                num_refs += head->ref_mod;
 230                spin_unlock(&head->lock);
 231                mutex_unlock(&head->mutex);
 232        }
 233        spin_unlock(&delayed_refs->lock);
 234out:
 235        WARN_ON(num_refs == 0);
 236        if (refs)
 237                *refs = num_refs;
 238        if (flags)
 239                *flags = extent_flags;
 240out_free:
 241        btrfs_free_path(path);
 242        return ret;
 243}
 244
 245/*
 246 * Back reference rules.  Back refs have three main goals:
 247 *
 248 * 1) differentiate between all holders of references to an extent so that
 249 *    when a reference is dropped we can make sure it was a valid reference
 250 *    before freeing the extent.
 251 *
 252 * 2) Provide enough information to quickly find the holders of an extent
 253 *    if we notice a given block is corrupted or bad.
 254 *
 255 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 256 *    maintenance.  This is actually the same as #2, but with a slightly
 257 *    different use case.
 258 *
  259 * There are two kinds of back refs. Implicit back refs are optimized
  260 * for pointers in non-shared tree blocks. For a given pointer in a block,
  261 * back refs of this kind provide information about the block's owner tree
  262 * and the pointer's key. This information allows us to find the block by
  263 * b-tree search. Full back refs are for pointers in tree blocks not
  264 * referenced by their owner trees. The location of the tree block is
  265 * recorded in the back refs. Full back refs are actually generic and can
  266 * be used wherever implicit back refs are used; their major shortcoming
  267 * is overhead. Every time a tree block gets COWed, we have to update the
  268 * back ref entries for all pointers in it.
 269 *
 270 * For a newly allocated tree block, we use implicit back refs for
 271 * pointers in it. This means most tree related operations only involve
  272 * implicit back refs. For a tree block created in an old transaction, the
  273 * only way to drop a reference to it is to COW it. So we can detect the
  274 * event that a tree block loses its owner tree's reference and do the
  275 * back ref conversion.
 276 *
 277 * When a tree block is COWed through a tree, there are four cases:
 278 *
 279 * The reference count of the block is one and the tree is the block's
 280 * owner tree. Nothing to do in this case.
 281 *
 282 * The reference count of the block is one and the tree is not the
  283 * block's owner tree. In this case, full back refs are used for pointers
  284 * in the block. Remove these full back refs and add implicit back refs for
  285 * every pointer in the new block.
 286 *
 287 * The reference count of the block is greater than one and the tree is
  288 * the block's owner tree. In this case, implicit back refs are used for
  289 * pointers in the block. Add full back refs for every pointer in the
  290 * block and increase the lower level extents' reference counts. The
  291 * original implicit back refs carry over to the new block.
 292 *
 293 * The reference count of the block is greater than one and the tree is
  294 * not the block's owner tree. Add implicit back refs for every pointer in
  295 * the new block and increase the lower level extents' reference counts.
 296 *
 297 * Back Reference Key composing:
 298 *
 299 * The key objectid corresponds to the first byte in the extent,
 300 * The key type is used to differentiate between types of back refs.
 301 * There are different meanings of the key offset for different types
 302 * of back refs.
 303 *
 304 * File extents can be referenced by:
 305 *
 306 * - multiple snapshots, subvolumes, or different generations in one subvol
 307 * - different files inside a single subvolume
 308 * - different offsets inside a file (bookend extents in file.c)
 309 *
 310 * The extent ref structure for the implicit back refs has fields for:
 311 *
 312 * - Objectid of the subvolume root
 313 * - objectid of the file holding the reference
 314 * - original offset in the file
 315 * - how many bookend extents
 316 *
  317 * The key offset for the implicit back refs is a hash of the first
 318 * three fields.
 319 *
  320 * The extent ref structure for the full back refs has a field for:
 321 *
 322 * - number of pointers in the tree leaf
 323 *
  324 * The key offset for the full back refs is the first byte of
  325 * the tree leaf.
 326 *
  327 * When a file extent is allocated, the implicit back ref is used and
  328 * the fields are filled in:
 329 *
 330 *     (root_key.objectid, inode objectid, offset in file, 1)
 331 *
  332 * When a file extent is removed during file truncation, we find the
 333 * corresponding implicit back refs and check the following fields:
 334 *
 335 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 336 *
 337 * Btree extents can be referenced by:
 338 *
 339 * - Different subvolumes
 340 *
  341 * Both the implicit back refs and the full back refs for tree blocks
  342 * consist of the key only. The key offset for the implicit back refs is
  343 * the objectid of the block's owner tree. The key offset for the full
  344 * back refs is the first byte of the parent block.
 345 *
  346 * When implicit back refs are used, information about the lowest key and
  347 * the level of the tree block is required. This information is stored in
  348 * the tree block info structure.
 349 */
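/*
 * Illustrative example (added for clarity; the values are hypothetical):
 * a data extent starting at logical byte X, referenced by inode 257 of
 * subvolume 5 at file offset 0, gets an implicit back ref item keyed as
 *
 *     key.objectid = X
 *     key.type     = BTRFS_EXTENT_DATA_REF_KEY
 *     key.offset   = hash_extent_data_ref(5, 257, 0)
 *
 * whereas a full (shared) back ref for the same extent would instead use
 * BTRFS_SHARED_DATA_REF_KEY with key.offset set to the bytenr of the
 * referencing leaf.
 */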
 350
 351/*
 352 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
  353 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 354 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 355 */
 356int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 357                                     struct btrfs_extent_inline_ref *iref,
 358                                     enum btrfs_inline_ref_type is_data)
 359{
 360        int type = btrfs_extent_inline_ref_type(eb, iref);
 361        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 362
 363        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 364            type == BTRFS_SHARED_BLOCK_REF_KEY ||
 365            type == BTRFS_SHARED_DATA_REF_KEY ||
 366            type == BTRFS_EXTENT_DATA_REF_KEY) {
 367                if (is_data == BTRFS_REF_TYPE_BLOCK) {
 368                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
 369                                return type;
 370                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 371                                ASSERT(eb->fs_info);
 372                                /*
  373                                 * Every shared ref has a parent tree block,
  374                                 * which must be aligned to the sector size.
 375                                 */
 376                                if (offset &&
 377                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
 378                                        return type;
 379                        }
 380                } else if (is_data == BTRFS_REF_TYPE_DATA) {
 381                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
 382                                return type;
 383                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
 384                                ASSERT(eb->fs_info);
 385                                /*
  386                                 * Every shared ref has a parent tree block,
  387                                 * which must be aligned to the sector size.
 388                                 */
 389                                if (offset &&
 390                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
 391                                        return type;
 392                        }
 393                } else {
 394                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
 395                        return type;
 396                }
 397        }
 398
 399        btrfs_print_leaf((struct extent_buffer *)eb);
 400        btrfs_err(eb->fs_info,
 401                  "eb %llu iref 0x%lx invalid extent inline ref type %d",
 402                  eb->start, (unsigned long)iref, type);
 403        WARN_ON(1);
 404
 405        return BTRFS_REF_TYPE_INVALID;
 406}
 407
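/*
 * Hash the (root, objectid, offset) triple identifying an implicit data back
 * ref.  The result is used as the key offset of BTRFS_EXTENT_DATA_REF_KEY
 * items, so the exact formula (including the 31-bit shift below) is part of
 * the on-disk format and must not change; hash collisions are handled by the
 * callers probing forward with key.offset.
 */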
 408u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 409{
 410        u32 high_crc = ~(u32)0;
 411        u32 low_crc = ~(u32)0;
 412        __le64 lenum;
 413
 414        lenum = cpu_to_le64(root_objectid);
 415        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 416        lenum = cpu_to_le64(owner);
 417        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 418        lenum = cpu_to_le64(offset);
 419        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 420
 421        return ((u64)high_crc << 31) ^ (u64)low_crc;
 422}
 423
 424static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 425                                     struct btrfs_extent_data_ref *ref)
 426{
 427        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 428                                    btrfs_extent_data_ref_objectid(leaf, ref),
 429                                    btrfs_extent_data_ref_offset(leaf, ref));
 430}
 431
 432static int match_extent_data_ref(struct extent_buffer *leaf,
 433                                 struct btrfs_extent_data_ref *ref,
 434                                 u64 root_objectid, u64 owner, u64 offset)
 435{
 436        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
 437            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
 438            btrfs_extent_data_ref_offset(leaf, ref) != offset)
 439                return 0;
 440        return 1;
 441}
 442
 443static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 444                                           struct btrfs_path *path,
 445                                           u64 bytenr, u64 parent,
 446                                           u64 root_objectid,
 447                                           u64 owner, u64 offset)
 448{
 449        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 450        struct btrfs_key key;
 451        struct btrfs_extent_data_ref *ref;
 452        struct extent_buffer *leaf;
 453        u32 nritems;
 454        int ret;
 455        int recow;
 456        int err = -ENOENT;
 457
 458        key.objectid = bytenr;
 459        if (parent) {
 460                key.type = BTRFS_SHARED_DATA_REF_KEY;
 461                key.offset = parent;
 462        } else {
 463                key.type = BTRFS_EXTENT_DATA_REF_KEY;
 464                key.offset = hash_extent_data_ref(root_objectid,
 465                                                  owner, offset);
 466        }
 467again:
 468        recow = 0;
 469        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 470        if (ret < 0) {
 471                err = ret;
 472                goto fail;
 473        }
 474
 475        if (parent) {
 476                if (!ret)
 477                        return 0;
 478                goto fail;
 479        }
 480
 481        leaf = path->nodes[0];
 482        nritems = btrfs_header_nritems(leaf);
 483        while (1) {
 484                if (path->slots[0] >= nritems) {
 485                        ret = btrfs_next_leaf(root, path);
 486                        if (ret < 0)
 487                                err = ret;
 488                        if (ret)
 489                                goto fail;
 490
 491                        leaf = path->nodes[0];
 492                        nritems = btrfs_header_nritems(leaf);
 493                        recow = 1;
 494                }
 495
 496                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 497                if (key.objectid != bytenr ||
 498                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
 499                        goto fail;
 500
 501                ref = btrfs_item_ptr(leaf, path->slots[0],
 502                                     struct btrfs_extent_data_ref);
 503
 504                if (match_extent_data_ref(leaf, ref, root_objectid,
 505                                          owner, offset)) {
 506                        if (recow) {
 507                                btrfs_release_path(path);
 508                                goto again;
 509                        }
 510                        err = 0;
 511                        break;
 512                }
 513                path->slots[0]++;
 514        }
 515fail:
 516        return err;
 517}
 518
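/*
 * Insert a data back ref item, or bump the count of an existing one.  For
 * implicit refs the key offset is hash_extent_data_ref(); if a different ref
 * already occupies that offset (hash collision), the loop below probes
 * forward with key.offset++ until the matching ref or a free slot is found.
 */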
 519static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 520                                           struct btrfs_path *path,
 521                                           u64 bytenr, u64 parent,
 522                                           u64 root_objectid, u64 owner,
 523                                           u64 offset, int refs_to_add)
 524{
 525        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 526        struct btrfs_key key;
 527        struct extent_buffer *leaf;
 528        u32 size;
 529        u32 num_refs;
 530        int ret;
 531
 532        key.objectid = bytenr;
 533        if (parent) {
 534                key.type = BTRFS_SHARED_DATA_REF_KEY;
 535                key.offset = parent;
 536                size = sizeof(struct btrfs_shared_data_ref);
 537        } else {
 538                key.type = BTRFS_EXTENT_DATA_REF_KEY;
 539                key.offset = hash_extent_data_ref(root_objectid,
 540                                                  owner, offset);
 541                size = sizeof(struct btrfs_extent_data_ref);
 542        }
 543
 544        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
 545        if (ret && ret != -EEXIST)
 546                goto fail;
 547
 548        leaf = path->nodes[0];
 549        if (parent) {
 550                struct btrfs_shared_data_ref *ref;
 551                ref = btrfs_item_ptr(leaf, path->slots[0],
 552                                     struct btrfs_shared_data_ref);
 553                if (ret == 0) {
 554                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
 555                } else {
 556                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
 557                        num_refs += refs_to_add;
 558                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 559                }
 560        } else {
 561                struct btrfs_extent_data_ref *ref;
 562                while (ret == -EEXIST) {
 563                        ref = btrfs_item_ptr(leaf, path->slots[0],
 564                                             struct btrfs_extent_data_ref);
 565                        if (match_extent_data_ref(leaf, ref, root_objectid,
 566                                                  owner, offset))
 567                                break;
 568                        btrfs_release_path(path);
 569                        key.offset++;
 570                        ret = btrfs_insert_empty_item(trans, root, path, &key,
 571                                                      size);
 572                        if (ret && ret != -EEXIST)
 573                                goto fail;
 574
 575                        leaf = path->nodes[0];
 576                }
 577                ref = btrfs_item_ptr(leaf, path->slots[0],
 578                                     struct btrfs_extent_data_ref);
 579                if (ret == 0) {
 580                        btrfs_set_extent_data_ref_root(leaf, ref,
 581                                                       root_objectid);
 582                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 583                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 584                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
 585                } else {
 586                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
 587                        num_refs += refs_to_add;
 588                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 589                }
 590        }
 591        btrfs_mark_buffer_dirty(leaf);
 592        ret = 0;
 593fail:
 594        btrfs_release_path(path);
 595        return ret;
 596}
 597
 598static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 599                                           struct btrfs_root *root,
 600                                           struct btrfs_path *path,
 601                                           int refs_to_drop)
 602{
 603        struct btrfs_key key;
 604        struct btrfs_extent_data_ref *ref1 = NULL;
 605        struct btrfs_shared_data_ref *ref2 = NULL;
 606        struct extent_buffer *leaf;
 607        u32 num_refs = 0;
 608        int ret = 0;
 609
 610        leaf = path->nodes[0];
 611        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 612
 613        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 614                ref1 = btrfs_item_ptr(leaf, path->slots[0],
 615                                      struct btrfs_extent_data_ref);
 616                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 617        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 618                ref2 = btrfs_item_ptr(leaf, path->slots[0],
 619                                      struct btrfs_shared_data_ref);
 620                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 621        } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
 622                btrfs_print_v0_err(trans->fs_info);
 623                btrfs_abort_transaction(trans, -EINVAL);
 624                return -EINVAL;
 625        } else {
 626                BUG();
 627        }
 628
 629        BUG_ON(num_refs < refs_to_drop);
 630        num_refs -= refs_to_drop;
 631
 632        if (num_refs == 0) {
 633                ret = btrfs_del_item(trans, root, path);
 634        } else {
 635                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
 636                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
 637                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
 638                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
 639                btrfs_mark_buffer_dirty(leaf);
 640        }
 641        return ret;
 642}
 643
 644static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 645                                          struct btrfs_extent_inline_ref *iref)
 646{
 647        struct btrfs_key key;
 648        struct extent_buffer *leaf;
 649        struct btrfs_extent_data_ref *ref1;
 650        struct btrfs_shared_data_ref *ref2;
 651        u32 num_refs = 0;
 652        int type;
 653
 654        leaf = path->nodes[0];
 655        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 656
 657        BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 658        if (iref) {
 659                /*
 660                 * If type is invalid, we should have bailed out earlier than
 661                 * this call.
 662                 */
 663                type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 664                ASSERT(type != BTRFS_REF_TYPE_INVALID);
 665                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 666                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
 667                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 668                } else {
 669                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
 670                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 671                }
 672        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 673                ref1 = btrfs_item_ptr(leaf, path->slots[0],
 674                                      struct btrfs_extent_data_ref);
 675                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 676        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 677                ref2 = btrfs_item_ptr(leaf, path->slots[0],
 678                                      struct btrfs_shared_data_ref);
 679                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 680        } else {
 681                WARN_ON(1);
 682        }
 683        return num_refs;
 684}
 685
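/*
 * Tree block back refs carry no payload beyond the key itself:
 * BTRFS_SHARED_BLOCK_REF_KEY uses the parent block's bytenr as the key
 * offset, while BTRFS_TREE_BLOCK_REF_KEY uses the owning root's objectid.
 */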
 686static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
 687                                          struct btrfs_path *path,
 688                                          u64 bytenr, u64 parent,
 689                                          u64 root_objectid)
 690{
 691        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 692        struct btrfs_key key;
 693        int ret;
 694
 695        key.objectid = bytenr;
 696        if (parent) {
 697                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 698                key.offset = parent;
 699        } else {
 700                key.type = BTRFS_TREE_BLOCK_REF_KEY;
 701                key.offset = root_objectid;
 702        }
 703
 704        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 705        if (ret > 0)
 706                ret = -ENOENT;
 707        return ret;
 708}
 709
 710static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 711                                          struct btrfs_path *path,
 712                                          u64 bytenr, u64 parent,
 713                                          u64 root_objectid)
 714{
 715        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 716        struct btrfs_key key;
 717        int ret;
 718
 719        key.objectid = bytenr;
 720        if (parent) {
 721                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 722                key.offset = parent;
 723        } else {
 724                key.type = BTRFS_TREE_BLOCK_REF_KEY;
 725                key.offset = root_objectid;
 726        }
 727
 728        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
 729        btrfs_release_path(path);
 730        return ret;
 731}
 732
 733static inline int extent_ref_type(u64 parent, u64 owner)
 734{
 735        int type;
 736        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 737                if (parent > 0)
 738                        type = BTRFS_SHARED_BLOCK_REF_KEY;
 739                else
 740                        type = BTRFS_TREE_BLOCK_REF_KEY;
 741        } else {
 742                if (parent > 0)
 743                        type = BTRFS_SHARED_DATA_REF_KEY;
 744                else
 745                        type = BTRFS_EXTENT_DATA_REF_KEY;
 746        }
 747        return type;
 748}
 749
 750static int find_next_key(struct btrfs_path *path, int level,
 751                         struct btrfs_key *key)
 752
 753{
 754        for (; level < BTRFS_MAX_LEVEL; level++) {
 755                if (!path->nodes[level])
 756                        break;
 757                if (path->slots[level] + 1 >=
 758                    btrfs_header_nritems(path->nodes[level]))
 759                        continue;
 760                if (level == 0)
 761                        btrfs_item_key_to_cpu(path->nodes[level], key,
 762                                              path->slots[level] + 1);
 763                else
 764                        btrfs_node_key_to_cpu(path->nodes[level], key,
 765                                              path->slots[level] + 1);
 766                return 0;
 767        }
 768        return 1;
 769}
 770
 771/*
  772 * Look for an inline back ref. If the back ref is found, *ref_ret is set
  773 * to the address of the inline back ref and 0 is returned.
  774 *
  775 * If the back ref isn't found, *ref_ret is set to the address where it
  776 * should be inserted and -ENOENT is returned.
  777 *
  778 * If insert is true and there are too many inline back refs, the path
  779 * points to the extent item and -EAGAIN is returned.
 780 *
 781 * NOTE: inline back refs are ordered in the same way that back ref
 782 *       items in the tree are ordered.
 783 */
 784static noinline_for_stack
 785int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 786                                 struct btrfs_path *path,
 787                                 struct btrfs_extent_inline_ref **ref_ret,
 788                                 u64 bytenr, u64 num_bytes,
 789                                 u64 parent, u64 root_objectid,
 790                                 u64 owner, u64 offset, int insert)
 791{
 792        struct btrfs_fs_info *fs_info = trans->fs_info;
 793        struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
 794        struct btrfs_key key;
 795        struct extent_buffer *leaf;
 796        struct btrfs_extent_item *ei;
 797        struct btrfs_extent_inline_ref *iref;
 798        u64 flags;
 799        u64 item_size;
 800        unsigned long ptr;
 801        unsigned long end;
 802        int extra_size;
 803        int type;
 804        int want;
 805        int ret;
 806        int err = 0;
 807        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 808        int needed;
 809
 810        key.objectid = bytenr;
 811        key.type = BTRFS_EXTENT_ITEM_KEY;
 812        key.offset = num_bytes;
 813
 814        want = extent_ref_type(parent, owner);
 815        if (insert) {
 816                extra_size = btrfs_extent_inline_ref_size(want);
 817                path->search_for_extension = 1;
 818                path->keep_locks = 1;
 819        } else
 820                extra_size = -1;
 821
 822        /*
 823         * Owner is our level, so we can just add one to get the level for the
 824         * block we are interested in.
 825         */
 826        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
 827                key.type = BTRFS_METADATA_ITEM_KEY;
 828                key.offset = owner;
 829        }
 830
 831again:
 832        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 833        if (ret < 0) {
 834                err = ret;
 835                goto out;
 836        }
 837
 838        /*
 839         * We may be a newly converted file system which still has the old fat
 840         * extent entries for metadata, so try and see if we have one of those.
 841         */
 842        if (ret > 0 && skinny_metadata) {
 843                skinny_metadata = false;
 844                if (path->slots[0]) {
 845                        path->slots[0]--;
 846                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 847                                              path->slots[0]);
 848                        if (key.objectid == bytenr &&
 849                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 850                            key.offset == num_bytes)
 851                                ret = 0;
 852                }
 853                if (ret) {
 854                        key.objectid = bytenr;
 855                        key.type = BTRFS_EXTENT_ITEM_KEY;
 856                        key.offset = num_bytes;
 857                        btrfs_release_path(path);
 858                        goto again;
 859                }
 860        }
 861
 862        if (ret && !insert) {
 863                err = -ENOENT;
 864                goto out;
 865        } else if (WARN_ON(ret)) {
 866                err = -EIO;
 867                goto out;
 868        }
 869
 870        leaf = path->nodes[0];
 871        item_size = btrfs_item_size(leaf, path->slots[0]);
 872        if (unlikely(item_size < sizeof(*ei))) {
 873                err = -EINVAL;
 874                btrfs_print_v0_err(fs_info);
 875                btrfs_abort_transaction(trans, err);
 876                goto out;
 877        }
 878
 879        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 880        flags = btrfs_extent_flags(leaf, ei);
 881
 882        ptr = (unsigned long)(ei + 1);
 883        end = (unsigned long)ei + item_size;
 884
 885        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
 886                ptr += sizeof(struct btrfs_tree_block_info);
 887                BUG_ON(ptr > end);
 888        }
 889
 890        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
 891                needed = BTRFS_REF_TYPE_DATA;
 892        else
 893                needed = BTRFS_REF_TYPE_BLOCK;
 894
 895        err = -ENOENT;
 896        while (1) {
 897                if (ptr >= end) {
 898                        if (ptr > end) {
 899                                err = -EUCLEAN;
 900                                btrfs_print_leaf(path->nodes[0]);
 901                                btrfs_crit(fs_info,
 902"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
 903                                        path->slots[0], root_objectid, owner, offset, parent);
 904                        }
 905                        break;
 906                }
 907                iref = (struct btrfs_extent_inline_ref *)ptr;
 908                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
 909                if (type == BTRFS_REF_TYPE_INVALID) {
 910                        err = -EUCLEAN;
 911                        goto out;
 912                }
 913
 914                if (want < type)
 915                        break;
 916                if (want > type) {
 917                        ptr += btrfs_extent_inline_ref_size(type);
 918                        continue;
 919                }
 920
 921                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 922                        struct btrfs_extent_data_ref *dref;
 923                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 924                        if (match_extent_data_ref(leaf, dref, root_objectid,
 925                                                  owner, offset)) {
 926                                err = 0;
 927                                break;
 928                        }
 929                        if (hash_extent_data_ref_item(leaf, dref) <
 930                            hash_extent_data_ref(root_objectid, owner, offset))
 931                                break;
 932                } else {
 933                        u64 ref_offset;
 934                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
 935                        if (parent > 0) {
 936                                if (parent == ref_offset) {
 937                                        err = 0;
 938                                        break;
 939                                }
 940                                if (ref_offset < parent)
 941                                        break;
 942                        } else {
 943                                if (root_objectid == ref_offset) {
 944                                        err = 0;
 945                                        break;
 946                                }
 947                                if (ref_offset < root_objectid)
 948                                        break;
 949                        }
 950                }
 951                ptr += btrfs_extent_inline_ref_size(type);
 952        }
 953        if (err == -ENOENT && insert) {
 954                if (item_size + extra_size >=
 955                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
 956                        err = -EAGAIN;
 957                        goto out;
 958                }
 959                /*
  960                 * To add a new inline back ref, we have to make sure
  961                 * there is no corresponding back ref item.
  962                 * For simplicity, we just do not add a new inline back
  963                 * ref if there is any kind of item for this block.
 964                 */
 965                if (find_next_key(path, 0, &key) == 0 &&
 966                    key.objectid == bytenr &&
 967                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 968                        err = -EAGAIN;
 969                        goto out;
 970                }
 971        }
 972        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
 973out:
 974        if (insert) {
 975                path->keep_locks = 0;
 976                path->search_for_extension = 0;
 977                btrfs_unlock_up_safe(path, 1);
 978        }
 979        return err;
 980}
 981
 982/*
  983 * helper to add a new inline back ref
 984 */
 985static noinline_for_stack
 986void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 987                                 struct btrfs_path *path,
 988                                 struct btrfs_extent_inline_ref *iref,
 989                                 u64 parent, u64 root_objectid,
 990                                 u64 owner, u64 offset, int refs_to_add,
 991                                 struct btrfs_delayed_extent_op *extent_op)
 992{
 993        struct extent_buffer *leaf;
 994        struct btrfs_extent_item *ei;
 995        unsigned long ptr;
 996        unsigned long end;
 997        unsigned long item_offset;
 998        u64 refs;
 999        int size;
1000        int type;
1001
1002        leaf = path->nodes[0];
1003        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1004        item_offset = (unsigned long)iref - (unsigned long)ei;
1005
1006        type = extent_ref_type(parent, owner);
1007        size = btrfs_extent_inline_ref_size(type);
1008
1009        btrfs_extend_item(path, size);
1010
1011        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1012        refs = btrfs_extent_refs(leaf, ei);
1013        refs += refs_to_add;
1014        btrfs_set_extent_refs(leaf, ei, refs);
1015        if (extent_op)
1016                __run_delayed_extent_op(extent_op, leaf, ei);
1017
1018        ptr = (unsigned long)ei + item_offset;
1019        end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1020        if (ptr < end - size)
1021                memmove_extent_buffer(leaf, ptr + size, ptr,
1022                                      end - size - ptr);
1023
1024        iref = (struct btrfs_extent_inline_ref *)ptr;
1025        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1026        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1027                struct btrfs_extent_data_ref *dref;
1028                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1029                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1030                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1031                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1032                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1033        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1034                struct btrfs_shared_data_ref *sref;
1035                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1036                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1037                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1038        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1039                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1040        } else {
1041                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1042        }
1043        btrfs_mark_buffer_dirty(leaf);
1044}
1045
1046static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1047                                 struct btrfs_path *path,
1048                                 struct btrfs_extent_inline_ref **ref_ret,
1049                                 u64 bytenr, u64 num_bytes, u64 parent,
1050                                 u64 root_objectid, u64 owner, u64 offset)
1051{
1052        int ret;
1053
1054        ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1055                                           num_bytes, parent, root_objectid,
1056                                           owner, offset, 0);
1057        if (ret != -ENOENT)
1058                return ret;
1059
1060        btrfs_release_path(path);
1061        *ref_ret = NULL;
1062
1063        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1064                ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1065                                            root_objectid);
1066        } else {
1067                ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1068                                             root_objectid, owner, offset);
1069        }
1070        return ret;
1071}
1072
1073/*
1074 * helper to update/remove inline back ref
1075 */
1076static noinline_for_stack
1077void update_inline_extent_backref(struct btrfs_path *path,
1078                                  struct btrfs_extent_inline_ref *iref,
1079                                  int refs_to_mod,
1080                                  struct btrfs_delayed_extent_op *extent_op)
1081{
1082        struct extent_buffer *leaf = path->nodes[0];
1083        struct btrfs_extent_item *ei;
1084        struct btrfs_extent_data_ref *dref = NULL;
1085        struct btrfs_shared_data_ref *sref = NULL;
1086        unsigned long ptr;
1087        unsigned long end;
1088        u32 item_size;
1089        int size;
1090        int type;
1091        u64 refs;
1092
1093        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1094        refs = btrfs_extent_refs(leaf, ei);
1095        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1096        refs += refs_to_mod;
1097        btrfs_set_extent_refs(leaf, ei, refs);
1098        if (extent_op)
1099                __run_delayed_extent_op(extent_op, leaf, ei);
1100
1101        /*
1102         * If type is invalid, we should have bailed out after
1103         * lookup_inline_extent_backref().
1104         */
1105        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1106        ASSERT(type != BTRFS_REF_TYPE_INVALID);
1107
1108        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1109                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1110                refs = btrfs_extent_data_ref_count(leaf, dref);
1111        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1112                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1113                refs = btrfs_shared_data_ref_count(leaf, sref);
1114        } else {
1115                refs = 1;
1116                BUG_ON(refs_to_mod != -1);
1117        }
1118
1119        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1120        refs += refs_to_mod;
1121
1122        if (refs > 0) {
1123                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1124                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1125                else
1126                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1127        } else {
1128                size =  btrfs_extent_inline_ref_size(type);
1129                item_size = btrfs_item_size(leaf, path->slots[0]);
1130                ptr = (unsigned long)iref;
1131                end = (unsigned long)ei + item_size;
1132                if (ptr + size < end)
1133                        memmove_extent_buffer(leaf, ptr, ptr + size,
1134                                              end - ptr - size);
1135                item_size -= size;
1136                btrfs_truncate_item(path, item_size, 1);
1137        }
1138        btrfs_mark_buffer_dirty(leaf);
1139}
1140
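/*
 * Add @refs_to_add references as an inline back ref if possible.  A return
 * of -EAGAIN from lookup_inline_extent_backref() (the extent item has no
 * room for another inline ref) is passed up so the caller can fall back to
 * a separate back ref item.
 */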
1141static noinline_for_stack
1142int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1143                                 struct btrfs_path *path,
1144                                 u64 bytenr, u64 num_bytes, u64 parent,
1145                                 u64 root_objectid, u64 owner,
1146                                 u64 offset, int refs_to_add,
1147                                 struct btrfs_delayed_extent_op *extent_op)
1148{
1149        struct btrfs_extent_inline_ref *iref;
1150        int ret;
1151
1152        ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1153                                           num_bytes, parent, root_objectid,
1154                                           owner, offset, 1);
1155        if (ret == 0) {
1156                /*
 1157                 * We're adding refs to a tree block we already own; this
 1158                 * should not happen at all.
1159                 */
1160                if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1161                        btrfs_crit(trans->fs_info,
1162"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1163                                   bytenr, num_bytes, root_objectid);
1164                        if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1165                                WARN_ON(1);
1166                                btrfs_crit(trans->fs_info,
1167                        "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1168                                btrfs_print_leaf(path->nodes[0]);
1169                        }
1170                        return -EUCLEAN;
1171                }
1172                update_inline_extent_backref(path, iref, refs_to_add, extent_op);
1173        } else if (ret == -ENOENT) {
1174                setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1175                                            root_objectid, owner, offset,
1176                                            refs_to_add, extent_op);
1177                ret = 0;
1178        }
1179        return ret;
1180}
1181
1182static int remove_extent_backref(struct btrfs_trans_handle *trans,
1183                                 struct btrfs_root *root,
1184                                 struct btrfs_path *path,
1185                                 struct btrfs_extent_inline_ref *iref,
1186                                 int refs_to_drop, int is_data)
1187{
1188        int ret = 0;
1189
1190        BUG_ON(!is_data && refs_to_drop != 1);
1191        if (iref)
1192                update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
1193        else if (is_data)
1194                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1195        else
1196                ret = btrfs_del_item(trans, root, path);
1197        return ret;
1198}
1199
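/*
 * Issue discards for [start, start + len) on @bdev, skipping the ranges
 * occupied by the fixed superblock mirror copies.  Offsets and lengths are
 * in bytes; blkdev_issue_discard() takes 512-byte sectors, hence the >> 9
 * conversions below.
 */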
1200static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1201                               u64 *discarded_bytes)
1202{
1203        int j, ret = 0;
1204        u64 bytes_left, end;
1205        u64 aligned_start = ALIGN(start, 1 << 9);
1206
1207        if (WARN_ON(start != aligned_start)) {
1208                len -= aligned_start - start;
1209                len = round_down(len, 1 << 9);
1210                start = aligned_start;
1211        }
1212
1213        *discarded_bytes = 0;
1214
1215        if (!len)
1216                return 0;
1217
1218        end = start + len;
1219        bytes_left = len;
1220
1221        /* Skip any superblocks on this device. */
1222        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1223                u64 sb_start = btrfs_sb_offset(j);
1224                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1225                u64 size = sb_start - start;
1226
1227                if (!in_range(sb_start, start, bytes_left) &&
1228                    !in_range(sb_end, start, bytes_left) &&
1229                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1230                        continue;
1231
1232                /*
1233                 * Superblock spans beginning of range.  Adjust start and
1234                 * try again.
1235                 */
1236                if (sb_start <= start) {
1237                        start += sb_end - start;
1238                        if (start > end) {
1239                                bytes_left = 0;
1240                                break;
1241                        }
1242                        bytes_left = end - start;
1243                        continue;
1244                }
1245
1246                if (size) {
1247                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1248                                                   GFP_NOFS);
1249                        if (!ret)
1250                                *discarded_bytes += size;
1251                        else if (ret != -EOPNOTSUPP)
1252                                return ret;
1253                }
1254
1255                start = sb_end;
1256                if (start > end) {
1257                        bytes_left = 0;
1258                        break;
1259                }
1260                bytes_left = end - start;
1261        }
1262
1263        if (bytes_left) {
1264                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1265                                           GFP_NOFS);
1266                if (!ret)
1267                        *discarded_bytes += bytes_left;
1268        }
1269        return ret;
1270}
1271
1272static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
1273{
1274        struct btrfs_device *dev = stripe->dev;
1275        struct btrfs_fs_info *fs_info = dev->fs_info;
1276        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1277        u64 phys = stripe->physical;
1278        u64 len = stripe->length;
1279        u64 discarded = 0;
1280        int ret = 0;
1281
1282        /* Zone reset on a zoned filesystem */
1283        if (btrfs_can_zone_reset(dev, phys, len)) {
1284                u64 src_disc;
1285
1286                ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1287                if (ret)
1288                        goto out;
1289
1290                if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1291                    dev != dev_replace->srcdev)
1292                        goto out;
1293
1294                src_disc = discarded;
1295
1296                /* Send to replace target as well */
1297                ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1298                                              &discarded);
1299                discarded += src_disc;
1300        } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1301                ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1302        } else {
1303                ret = 0;
1304                *bytes = 0;
1305        }
1306
1307out:
1308        *bytes = discarded;
1309        return ret;
1310}
1311
1312int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1313                         u64 num_bytes, u64 *actual_bytes)
1314{
1315        int ret = 0;
1316        u64 discarded_bytes = 0;
1317        u64 end = bytenr + num_bytes;
1318        u64 cur = bytenr;
1319        struct btrfs_io_context *bioc = NULL;
1320
1321        /*
1322         * Avoid races with device replace and make sure our bioc has devices
1323         * associated to its stripes that don't go away while we are discarding.
1324         */
1325        btrfs_bio_counter_inc_blocked(fs_info);
1326        while (cur < end) {
1327                struct btrfs_io_stripe *stripe;
1328                int i;
1329
1330                num_bytes = end - cur;
1331                /* Tell the block device(s) that the sectors can be discarded */
1332                ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1333                                      &num_bytes, &bioc, 0);
1334                /*
1335                 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1336                 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1337                 * thus we can't continue anyway.
1338                 */
1339                if (ret < 0)
1340                        goto out;
1341
1342                stripe = bioc->stripes;
1343                for (i = 0; i < bioc->num_stripes; i++, stripe++) {
1344                        u64 bytes;
1345                        struct btrfs_device *device = stripe->dev;
1346
1347                        if (!device->bdev) {
1348                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1349                                continue;
1350                        }
1351
1352                        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1353                                continue;
1354
1355                        ret = do_discard_extent(stripe, &bytes);
1356                        if (!ret) {
1357                                discarded_bytes += bytes;
1358                        } else if (ret != -EOPNOTSUPP) {
1359                                /*
1360                                 * Logic errors, -ENOMEM or -EIO, all of which
1361                                 * are unlikely to happen.
1362                                 *
1363                                 * And since there are two loops, explicitly
1364                                 * go to out to avoid confusion.
1365                                 */
1366                                btrfs_put_bioc(bioc);
1367                                goto out;
1368                        }
1369
1370                        /*
1371                         * In case we get back -EOPNOTSUPP for some reason, just
1372                         * ignore the return value so we don't screw up callers
1373                         * of discard_extent.
1374                         */
1375                        ret = 0;
1376                }
1377                btrfs_put_bioc(bioc);
1378                cur += num_bytes;
1379        }
1380out:
1381        btrfs_bio_counter_dec(fs_info);
1382
1383        if (actual_bytes)
1384                *actual_bytes = discarded_bytes;
1385
1386
1387        if (ret == -EOPNOTSUPP)
1388                ret = 0;
1389        return ret;
1390}
1391
1392/* Can return -ENOMEM */
1393int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1394                         struct btrfs_ref *generic_ref)
1395{
1396        struct btrfs_fs_info *fs_info = trans->fs_info;
1397        int ret;
1398
1399        ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1400               generic_ref->action);
1401        BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1402               generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1403
1404        if (generic_ref->type == BTRFS_REF_METADATA)
1405                ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1406        else
1407                ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1408
1409        btrfs_ref_tree_mod(fs_info, generic_ref);
1410
1411        return ret;
1412}
1413
1414/*
1415 * __btrfs_inc_extent_ref - insert backreference for a given extent
1416 *
1417 * The counterpart is in __btrfs_free_extent(), with examples and more details
1418 * on how it works.
1419 *
1420 * @trans:          Handle of transaction
1421 *
1422 * @node:           The delayed ref node used to get the bytenr/length for
1423 *                  extent whose references are incremented.
1424 *
1425 * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1426 *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1427 *                  bytenr of the parent block. Since new extents are always
1428 *                  created with indirect references, this will only be the case
1429 *                  when relocating a shared extent. In that case, root_objectid
1430 *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1431 *                  be 0
1432 *
1433 * @root_objectid:  The id of the root where this modification has originated,
1434 *                  this can be either one of the well-known metadata trees or
1435 *                  the subvolume id which references this extent.
1436 *
1437 * @owner:          For data extents it is the inode number of the owning file.
1438 *                  For metadata extents this parameter holds the level in the
1439 *                  tree of the extent.
1440 *
1441 * @offset:         For metadata extents the offset is ignored and is currently
1442 *                  always passed as 0. For data extents it is the file offset
1443 *                  this extent belongs to.
1444 *
1445 * @refs_to_add:    Number of references to add
1446 *
1447 * @extent_op:      Pointer to a structure holding information necessary when
1448 *                  updating a tree block's flags
1449 *
1450 */
1451static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1452                                  struct btrfs_delayed_ref_node *node,
1453                                  u64 parent, u64 root_objectid,
1454                                  u64 owner, u64 offset, int refs_to_add,
1455                                  struct btrfs_delayed_extent_op *extent_op)
1456{
1457        struct btrfs_path *path;
1458        struct extent_buffer *leaf;
1459        struct btrfs_extent_item *item;
1460        struct btrfs_key key;
1461        u64 bytenr = node->bytenr;
1462        u64 num_bytes = node->num_bytes;
1463        u64 refs;
1464        int ret;
1465
1466        path = btrfs_alloc_path();
1467        if (!path)
1468                return -ENOMEM;
1469
1470        /* this will set up the path even if it fails to insert the back ref */
1471        ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1472                                           parent, root_objectid, owner,
1473                                           offset, refs_to_add, extent_op);
1474        if ((ret < 0 && ret != -EAGAIN) || !ret)
1475                goto out;
1476
1477        /*
1478         * Ok we had -EAGAIN which means we didn't have space to insert an
1479         * inline extent ref, so just update the reference count and add a
1480         * normal backref.
1481         */
1482        leaf = path->nodes[0];
1483        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1484        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1485        refs = btrfs_extent_refs(leaf, item);
1486        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1487        if (extent_op)
1488                __run_delayed_extent_op(extent_op, leaf, item);
1489
1490        btrfs_mark_buffer_dirty(leaf);
1491        btrfs_release_path(path);
1492
1493        /* now insert the actual backref */
1494        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1495                BUG_ON(refs_to_add != 1);
1496                ret = insert_tree_block_ref(trans, path, bytenr, parent,
1497                                            root_objectid);
1498        } else {
1499                ret = insert_extent_data_ref(trans, path, bytenr, parent,
1500                                             root_objectid, owner, offset,
1501                                             refs_to_add);
1502        }
1503        if (ret)
1504                btrfs_abort_transaction(trans, ret);
1505out:
1506        btrfs_free_path(path);
1507        return ret;
1508}
1509
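    /*
     * Process a single delayed data ref: insert a freshly allocated file
     * extent (ADD with insert_reserved set), add a reference to an existing
     * extent (ADD), or drop a reference (DROP).
     */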
1510static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1511                                struct btrfs_delayed_ref_node *node,
1512                                struct btrfs_delayed_extent_op *extent_op,
1513                                int insert_reserved)
1514{
1515        int ret = 0;
1516        struct btrfs_delayed_data_ref *ref;
1517        struct btrfs_key ins;
1518        u64 parent = 0;
1519        u64 ref_root = 0;
1520        u64 flags = 0;
1521
1522        ins.objectid = node->bytenr;
1523        ins.offset = node->num_bytes;
1524        ins.type = BTRFS_EXTENT_ITEM_KEY;
1525
1526        ref = btrfs_delayed_node_to_data_ref(node);
1527        trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1528
1529        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1530                parent = ref->parent;
1531        ref_root = ref->root;
1532
1533        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1534                if (extent_op)
1535                        flags |= extent_op->flags_to_set;
1536                ret = alloc_reserved_file_extent(trans, parent, ref_root,
1537                                                 flags, ref->objectid,
1538                                                 ref->offset, &ins,
1539                                                 node->ref_mod);
1540        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1541                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1542                                             ref->objectid, ref->offset,
1543                                             node->ref_mod, extent_op);
1544        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1545                ret = __btrfs_free_extent(trans, node, parent,
1546                                          ref_root, ref->objectid,
1547                                          ref->offset, node->ref_mod,
1548                                          extent_op);
1549        } else {
1550                BUG();
1551        }
1552        return ret;
1553}
1554
1555static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1556                                    struct extent_buffer *leaf,
1557                                    struct btrfs_extent_item *ei)
1558{
1559        u64 flags = btrfs_extent_flags(leaf, ei);
1560        if (extent_op->update_flags) {
1561                flags |= extent_op->flags_to_set;
1562                btrfs_set_extent_flags(leaf, ei, flags);
1563        }
1564
1565        if (extent_op->update_key) {
1566                struct btrfs_tree_block_info *bi;
1567                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1568                bi = (struct btrfs_tree_block_info *)(ei + 1);
1569                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1570        }
1571}
1572
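    /*
     * Apply a queued extent op (flags and/or tree block key update) to the
     * extent item of @head. With skinny metadata the METADATA_ITEM key is
     * tried first, falling back to the full EXTENT_ITEM key if needed.
     */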
1573static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1574                                 struct btrfs_delayed_ref_head *head,
1575                                 struct btrfs_delayed_extent_op *extent_op)
1576{
1577        struct btrfs_fs_info *fs_info = trans->fs_info;
1578        struct btrfs_root *root;
1579        struct btrfs_key key;
1580        struct btrfs_path *path;
1581        struct btrfs_extent_item *ei;
1582        struct extent_buffer *leaf;
1583        u32 item_size;
1584        int ret;
1585        int err = 0;
1586        int metadata = 1;
1587
1588        if (TRANS_ABORTED(trans))
1589                return 0;
1590
1591        if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1592                metadata = 0;
1593
1594        path = btrfs_alloc_path();
1595        if (!path)
1596                return -ENOMEM;
1597
1598        key.objectid = head->bytenr;
1599
1600        if (metadata) {
1601                key.type = BTRFS_METADATA_ITEM_KEY;
1602                key.offset = extent_op->level;
1603        } else {
1604                key.type = BTRFS_EXTENT_ITEM_KEY;
1605                key.offset = head->num_bytes;
1606        }
1607
1608        root = btrfs_extent_root(fs_info, key.objectid);
1609again:
1610        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1611        if (ret < 0) {
1612                err = ret;
1613                goto out;
1614        }
1615        if (ret > 0) {
1616                if (metadata) {
1617                        if (path->slots[0] > 0) {
1618                                path->slots[0]--;
1619                                btrfs_item_key_to_cpu(path->nodes[0], &key,
1620                                                      path->slots[0]);
1621                                if (key.objectid == head->bytenr &&
1622                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
1623                                    key.offset == head->num_bytes)
1624                                        ret = 0;
1625                        }
1626                        if (ret > 0) {
1627                                btrfs_release_path(path);
1628                                metadata = 0;
1629
1630                                key.objectid = head->bytenr;
1631                                key.offset = head->num_bytes;
1632                                key.type = BTRFS_EXTENT_ITEM_KEY;
1633                                goto again;
1634                        }
1635                } else {
1636                        err = -EIO;
1637                        goto out;
1638                }
1639        }
1640
1641        leaf = path->nodes[0];
1642        item_size = btrfs_item_size(leaf, path->slots[0]);
1643
1644        if (unlikely(item_size < sizeof(*ei))) {
1645                err = -EINVAL;
1646                btrfs_print_v0_err(fs_info);
1647                btrfs_abort_transaction(trans, err);
1648                goto out;
1649        }
1650
1651        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1652        __run_delayed_extent_op(extent_op, leaf, ei);
1653
1654        btrfs_mark_buffer_dirty(leaf);
1655out:
1656        btrfs_free_path(path);
1657        return err;
1658}
1659
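    /*
     * Process a single delayed tree block ref. A tree block ref always has
     * a ref_mod of 1, anything else is treated as corruption. The same
     * three cases as for data refs apply: insert reserved, add ref, drop
     * ref.
     */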
1660static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1661                                struct btrfs_delayed_ref_node *node,
1662                                struct btrfs_delayed_extent_op *extent_op,
1663                                int insert_reserved)
1664{
1665        int ret = 0;
1666        struct btrfs_delayed_tree_ref *ref;
1667        u64 parent = 0;
1668        u64 ref_root = 0;
1669
1670        ref = btrfs_delayed_node_to_tree_ref(node);
1671        trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1672
1673        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1674                parent = ref->parent;
1675        ref_root = ref->root;
1676
1677        if (node->ref_mod != 1) {
1678                btrfs_err(trans->fs_info,
1679        "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1680                          node->bytenr, node->ref_mod, node->action, ref_root,
1681                          parent);
1682                return -EIO;
1683        }
1684        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1685                BUG_ON(!extent_op || !extent_op->update_flags);
1686                ret = alloc_reserved_tree_block(trans, node, extent_op);
1687        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1688                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1689                                             ref->level, 0, 1, extent_op);
1690        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1691                ret = __btrfs_free_extent(trans, node, parent, ref_root,
1692                                          ref->level, 0, 1, extent_op);
1693        } else {
1694                BUG();
1695        }
1696        return ret;
1697}
1698
1699/* helper function to actually process a single delayed ref entry */
1700static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1701                               struct btrfs_delayed_ref_node *node,
1702                               struct btrfs_delayed_extent_op *extent_op,
1703                               int insert_reserved)
1704{
1705        int ret = 0;
1706
1707        if (TRANS_ABORTED(trans)) {
1708                if (insert_reserved)
1709                        btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1710                return 0;
1711        }
1712
1713        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1714            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1715                ret = run_delayed_tree_ref(trans, node, extent_op,
1716                                           insert_reserved);
1717        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1718                 node->type == BTRFS_SHARED_DATA_REF_KEY)
1719                ret = run_delayed_data_ref(trans, node, extent_op,
1720                                           insert_reserved);
1721        else
1722                BUG();
1723        if (ret && insert_reserved)
1724                btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1725        return ret;
1726}
1727
1728static inline struct btrfs_delayed_ref_node *
1729select_delayed_ref(struct btrfs_delayed_ref_head *head)
1730{
1731        struct btrfs_delayed_ref_node *ref;
1732
1733        if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1734                return NULL;
1735
1736        /*
1737         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1738         * This is to prevent a ref count from going down to zero, which deletes
1739         * the extent item from the extent tree, when there still are references
1740         * to add, which would fail because they would not find the extent item.
1741         */
1742        if (!list_empty(&head->ref_add_list))
1743                return list_first_entry(&head->ref_add_list,
1744                                struct btrfs_delayed_ref_node, add_list);
1745
1746        ref = rb_entry(rb_first_cached(&head->ref_tree),
1747                       struct btrfs_delayed_ref_node, ref_node);
1748        ASSERT(list_empty(&ref->add_list));
1749        return ref;
1750}
1751
1752static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1753                                      struct btrfs_delayed_ref_head *head)
1754{
1755        spin_lock(&delayed_refs->lock);
1756        head->processing = 0;
1757        delayed_refs->num_heads_ready++;
1758        spin_unlock(&delayed_refs->lock);
1759        btrfs_delayed_ref_unlock(head);
1760}
1761
1762static struct btrfs_delayed_extent_op *cleanup_extent_op(
1763                                struct btrfs_delayed_ref_head *head)
1764{
1765        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1766
1767        if (!extent_op)
1768                return NULL;
1769
1770        if (head->must_insert_reserved) {
1771                head->extent_op = NULL;
1772                btrfs_free_delayed_extent_op(extent_op);
1773                return NULL;
1774        }
1775        return extent_op;
1776}
1777
1778static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1779                                     struct btrfs_delayed_ref_head *head)
1780{
1781        struct btrfs_delayed_extent_op *extent_op;
1782        int ret;
1783
1784        extent_op = cleanup_extent_op(head);
1785        if (!extent_op)
1786                return 0;
1787        head->extent_op = NULL;
1788        spin_unlock(&head->lock);
1789        ret = run_delayed_extent_op(trans, head, extent_op);
1790        btrfs_free_delayed_extent_op(extent_op);
1791        return ret ? ret : 1;
1792}
1793
1794void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1795                                  struct btrfs_delayed_ref_root *delayed_refs,
1796                                  struct btrfs_delayed_ref_head *head)
1797{
1798        int nr_items = 1;       /* Dropping this ref head update. */
1799
1800        /*
1801         * We had csum deletions accounted for in our delayed refs rsv, so we
1802         * need to drop the csum leaves for this update from our delayed_refs_rsv.
1803         */
1804        if (head->total_ref_mod < 0 && head->is_data) {
1805                spin_lock(&delayed_refs->lock);
1806                delayed_refs->pending_csums -= head->num_bytes;
1807                spin_unlock(&delayed_refs->lock);
1808                nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1809        }
1810
1811        btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1812}
1813
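    /*
     * Final cleanup of a delayed ref head once all of its refs have been
     * run, called with head->lock held. Runs a remaining extent op, if any,
     * re-checks that no new refs were added, unlinks the head and, when the
     * reserved extent was never inserted, pins it and drops its csums.
     */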
1814static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1815                            struct btrfs_delayed_ref_head *head)
1816{
1817
1818        struct btrfs_fs_info *fs_info = trans->fs_info;
1819        struct btrfs_delayed_ref_root *delayed_refs;
1820        int ret;
1821
1822        delayed_refs = &trans->transaction->delayed_refs;
1823
1824        ret = run_and_cleanup_extent_op(trans, head);
1825        if (ret < 0) {
1826                unselect_delayed_ref_head(delayed_refs, head);
1827                btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1828                return ret;
1829        } else if (ret) {
1830                return ret;
1831        }
1832
1833        /*
1834         * Need to drop our head ref lock and re-acquire the delayed ref lock
1835         * and then re-check to make sure nobody got added.
1836         */
1837        spin_unlock(&head->lock);
1838        spin_lock(&delayed_refs->lock);
1839        spin_lock(&head->lock);
1840        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1841                spin_unlock(&head->lock);
1842                spin_unlock(&delayed_refs->lock);
1843                return 1;
1844        }
1845        btrfs_delete_ref_head(delayed_refs, head);
1846        spin_unlock(&head->lock);
1847        spin_unlock(&delayed_refs->lock);
1848
1849        if (head->must_insert_reserved) {
1850                btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1851                if (head->is_data) {
1852                        struct btrfs_root *csum_root;
1853
1854                        csum_root = btrfs_csum_root(fs_info, head->bytenr);
1855                        ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1856                                              head->num_bytes);
1857                }
1858        }
1859
1860        btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1861
1862        trace_run_delayed_ref_head(fs_info, head, 0);
1863        btrfs_delayed_ref_unlock(head);
1864        btrfs_put_delayed_ref_head(head);
1865        return ret;
1866}
1867
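    /*
     * Select the next delayed ref head to process and lock it. Returns NULL
     * when there is nothing left to do and ERR_PTR(-EAGAIN) if the head was
     * freed while we waited for its mutex.
     */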
1868static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1869                                        struct btrfs_trans_handle *trans)
1870{
1871        struct btrfs_delayed_ref_root *delayed_refs =
1872                &trans->transaction->delayed_refs;
1873        struct btrfs_delayed_ref_head *head = NULL;
1874        int ret;
1875
1876        spin_lock(&delayed_refs->lock);
1877        head = btrfs_select_ref_head(delayed_refs);
1878        if (!head) {
1879                spin_unlock(&delayed_refs->lock);
1880                return head;
1881        }
1882
1883        /*
1884         * Grab the lock that says we are going to process all the refs for
1885         * this head
1886         */
1887        ret = btrfs_delayed_ref_lock(delayed_refs, head);
1888        spin_unlock(&delayed_refs->lock);
1889
1890        /*
1891         * We may have dropped the spin lock to get the head mutex lock, and
1892         * that might have given someone else time to free the head.  If that's
1893         * true, it has been removed from our list and we can move on.
1894         */
1895        if (ret == -EAGAIN)
1896                head = ERR_PTR(-EAGAIN);
1897
1898        return head;
1899}
1900
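    /*
     * Run all currently queued refs of an already locked head, adjusting the
     * head's ref_mod as each ref is played. @run_refs is incremented for
     * every ref processed. Returns -EAGAIN if a ref is still blocked by the
     * tree mod log sequence, 0 otherwise.
     */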
1901static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1902                                    struct btrfs_delayed_ref_head *locked_ref,
1903                                    unsigned long *run_refs)
1904{
1905        struct btrfs_fs_info *fs_info = trans->fs_info;
1906        struct btrfs_delayed_ref_root *delayed_refs;
1907        struct btrfs_delayed_extent_op *extent_op;
1908        struct btrfs_delayed_ref_node *ref;
1909        int must_insert_reserved = 0;
1910        int ret;
1911
1912        delayed_refs = &trans->transaction->delayed_refs;
1913
1914        lockdep_assert_held(&locked_ref->mutex);
1915        lockdep_assert_held(&locked_ref->lock);
1916
1917        while ((ref = select_delayed_ref(locked_ref))) {
1918                if (ref->seq &&
1919                    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1920                        spin_unlock(&locked_ref->lock);
1921                        unselect_delayed_ref_head(delayed_refs, locked_ref);
1922                        return -EAGAIN;
1923                }
1924
1925                (*run_refs)++;
1926                ref->in_tree = 0;
1927                rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1928                RB_CLEAR_NODE(&ref->ref_node);
1929                if (!list_empty(&ref->add_list))
1930                        list_del(&ref->add_list);
1931                /*
1932                 * When we play the delayed ref, also correct the ref_mod on
1933                 * head
1934                 */
1935                switch (ref->action) {
1936                case BTRFS_ADD_DELAYED_REF:
1937                case BTRFS_ADD_DELAYED_EXTENT:
1938                        locked_ref->ref_mod -= ref->ref_mod;
1939                        break;
1940                case BTRFS_DROP_DELAYED_REF:
1941                        locked_ref->ref_mod += ref->ref_mod;
1942                        break;
1943                default:
1944                        WARN_ON(1);
1945                }
1946                atomic_dec(&delayed_refs->num_entries);
1947
1948                /*
1949                 * Record the must_insert_reserved flag before we drop the
1950                 * spin lock.
1951                 */
1952                must_insert_reserved = locked_ref->must_insert_reserved;
1953                locked_ref->must_insert_reserved = 0;
1954
1955                extent_op = locked_ref->extent_op;
1956                locked_ref->extent_op = NULL;
1957                spin_unlock(&locked_ref->lock);
1958
1959                ret = run_one_delayed_ref(trans, ref, extent_op,
1960                                          must_insert_reserved);
1961
1962                btrfs_free_delayed_extent_op(extent_op);
1963                if (ret) {
1964                        unselect_delayed_ref_head(delayed_refs, locked_ref);
1965                        btrfs_put_delayed_ref(ref);
1966                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
1967                                    ret);
1968                        return ret;
1969                }
1970
1971                btrfs_put_delayed_ref(ref);
1972                cond_resched();
1973
1974                spin_lock(&locked_ref->lock);
1975                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1976        }
1977
1978        return 0;
1979}
1980
1981/*
1982 * Returns 0 on success or if called with an already aborted transaction.
1983 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1984 */
1985static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1986                                             unsigned long nr)
1987{
1988        struct btrfs_fs_info *fs_info = trans->fs_info;
1989        struct btrfs_delayed_ref_root *delayed_refs;
1990        struct btrfs_delayed_ref_head *locked_ref = NULL;
1991        ktime_t start = ktime_get();
1992        int ret;
1993        unsigned long count = 0;
1994        unsigned long actual_count = 0;
1995
1996        delayed_refs = &trans->transaction->delayed_refs;
1997        do {
1998                if (!locked_ref) {
1999                        locked_ref = btrfs_obtain_ref_head(trans);
2000                        if (IS_ERR_OR_NULL(locked_ref)) {
2001                                if (PTR_ERR(locked_ref) == -EAGAIN) {
2002                                        continue;
2003                                } else {
2004                                        break;
2005                                }
2006                        }
2007                        count++;
2008                }
2009                /*
2010                 * We need to try to merge add/drops of the same ref since we
2011                 * can run into issues with relocate dropping the implicit ref
2012                 * and then it being added back again before the drop can
2013                 * finish.  If we merged anything we need to re-loop so we can
2014                 * get a good ref.
2015                 * Or we can get node references of the same type that weren't
2016                 * merged when created due to bumps in the tree mod seq, and
2017                 * we need to merge them to prevent adding an inline extent
2018                 * backref before dropping it (triggering a BUG_ON at
2019                 * insert_inline_extent_backref()).
2020                 */
2021                spin_lock(&locked_ref->lock);
2022                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2023
2024                ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2025                                                      &actual_count);
2026                if (ret < 0 && ret != -EAGAIN) {
2027                        /*
2028                         * Error, btrfs_run_delayed_refs_for_head already
2029                         * unlocked everything so just bail out
2030                         */
2031                        return ret;
2032                } else if (!ret) {
2033                        /*
2034                         * Success, perform the usual cleanup of a processed
2035                         * head
2036                         */
2037                        ret = cleanup_ref_head(trans, locked_ref);
2038                        if (ret > 0) {
2039                                /* We dropped our lock, we need to loop. */
2040                                ret = 0;
2041                                continue;
2042                        } else if (ret) {
2043                                return ret;
2044                        }
2045                }
2046
2047                /*
2048                 * Either success case or btrfs_run_delayed_refs_for_head
2049                 * returned -EAGAIN, meaning we need to select another head
2050                 */
2051
2052                locked_ref = NULL;
2053                cond_resched();
2054        } while ((nr != -1 && count < nr) || locked_ref);
2055
2056        /*
2057         * We don't want to include ref heads since we can have empty ref heads
2058         * and those will drastically skew our runtime down since we just do
2059         * accounting, no actual extent tree updates.
2060         */
2061        if (actual_count > 0) {
2062                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2063                u64 avg;
2064
2065                /*
2066                 * We weigh the current average higher than our current runtime
2067                 * to avoid large swings in the average.
2068                 */
2069                spin_lock(&delayed_refs->lock);
2070                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2071                fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2072                spin_unlock(&delayed_refs->lock);
2073        }
2074        return 0;
2075}
2076
2077#ifdef SCRAMBLE_DELAYED_REFS
2078/*
2079 * Normally delayed refs get processed in ascending bytenr order. This
2080 * correlates in most cases to the order added. To expose dependencies on this
2081 * order, we start to process the tree in the middle instead of the beginning
2082 */
2083static u64 find_middle(struct rb_root *root)
2084{
2085        struct rb_node *n = root->rb_node;
2086        struct btrfs_delayed_ref_node *entry;
2087        int alt = 1;
2088        u64 middle;
2089        u64 first = 0, last = 0;
2090
2091        n = rb_first(root);
2092        if (n) {
2093                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2094                first = entry->bytenr;
2095        }
2096        n = rb_last(root);
2097        if (n) {
2098                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2099                last = entry->bytenr;
2100        }
2101        n = root->rb_node;
2102
2103        while (n) {
2104                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2105                WARN_ON(!entry->in_tree);
2106
2107                middle = entry->bytenr;
2108
2109                if (alt)
2110                        n = n->rb_left;
2111                else
2112                        n = n->rb_right;
2113
2114                alt = 1 - alt;
2115        }
2116        return middle;
2117}
2118#endif
2119
2120/*
2121 * this starts processing the delayed reference count updates and
2122 * extent insertions we have queued up so far.  count can be
2123 * 0, which means to process everything in the tree at the start
2124 * of the run (but not newly added entries), or it can be some target
2125 * number you'd like to process.
2126 *
2127 * Returns 0 on success or if called with an aborted transaction
2128 * Returns <0 on error and aborts the transaction
2129 */
2130int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2131                           unsigned long count)
2132{
2133        struct btrfs_fs_info *fs_info = trans->fs_info;
2134        struct rb_node *node;
2135        struct btrfs_delayed_ref_root *delayed_refs;
2136        struct btrfs_delayed_ref_head *head;
2137        int ret;
2138        int run_all = count == (unsigned long)-1;
2139
2140        /* We'll clean this up in btrfs_cleanup_transaction */
2141        if (TRANS_ABORTED(trans))
2142                return 0;
2143
2144        if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2145                return 0;
2146
2147        delayed_refs = &trans->transaction->delayed_refs;
2148        if (count == 0)
2149                count = delayed_refs->num_heads_ready;
2150
2151again:
2152#ifdef SCRAMBLE_DELAYED_REFS
2153        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2154#endif
2155        ret = __btrfs_run_delayed_refs(trans, count);
2156        if (ret < 0) {
2157                btrfs_abort_transaction(trans, ret);
2158                return ret;
2159        }
2160
2161        if (run_all) {
2162                btrfs_create_pending_block_groups(trans);
2163
2164                spin_lock(&delayed_refs->lock);
2165                node = rb_first_cached(&delayed_refs->href_root);
2166                if (!node) {
2167                        spin_unlock(&delayed_refs->lock);
2168                        goto out;
2169                }
2170                head = rb_entry(node, struct btrfs_delayed_ref_head,
2171                                href_node);
2172                refcount_inc(&head->refs);
2173                spin_unlock(&delayed_refs->lock);
2174
2175                /* Mutex was contended, block until it's released and retry. */
2176                mutex_lock(&head->mutex);
2177                mutex_unlock(&head->mutex);
2178
2179                btrfs_put_delayed_ref_head(head);
2180                cond_resched();
2181                goto again;
2182        }
2183out:
2184        return 0;
2185}
2186
2187int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2188                                struct extent_buffer *eb, u64 flags,
2189                                int level)
2190{
2191        struct btrfs_delayed_extent_op *extent_op;
2192        int ret;
2193
2194        extent_op = btrfs_alloc_delayed_extent_op();
2195        if (!extent_op)
2196                return -ENOMEM;
2197
2198        extent_op->flags_to_set = flags;
2199        extent_op->update_flags = true;
2200        extent_op->update_key = false;
2201        extent_op->level = level;
2202
2203        ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2204        if (ret)
2205                btrfs_free_delayed_extent_op(extent_op);
2206        return ret;
2207}
2208
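    /*
     * Check whether the running transaction has a delayed ref on @bytenr
     * from a different root, inode or offset than the one given, i.e. a
     * cross reference. Returns 1 if such a ref exists, 0 if not, and
     * -EAGAIN if the head's mutex was contended and the caller must retry.
     */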
2209static noinline int check_delayed_ref(struct btrfs_root *root,
2210                                      struct btrfs_path *path,
2211                                      u64 objectid, u64 offset, u64 bytenr)
2212{
2213        struct btrfs_delayed_ref_head *head;
2214        struct btrfs_delayed_ref_node *ref;
2215        struct btrfs_delayed_data_ref *data_ref;
2216        struct btrfs_delayed_ref_root *delayed_refs;
2217        struct btrfs_transaction *cur_trans;
2218        struct rb_node *node;
2219        int ret = 0;
2220
2221        spin_lock(&root->fs_info->trans_lock);
2222        cur_trans = root->fs_info->running_transaction;
2223        if (cur_trans)
2224                refcount_inc(&cur_trans->use_count);
2225        spin_unlock(&root->fs_info->trans_lock);
2226        if (!cur_trans)
2227                return 0;
2228
2229        delayed_refs = &cur_trans->delayed_refs;
2230        spin_lock(&delayed_refs->lock);
2231        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2232        if (!head) {
2233                spin_unlock(&delayed_refs->lock);
2234                btrfs_put_transaction(cur_trans);
2235                return 0;
2236        }
2237
2238        if (!mutex_trylock(&head->mutex)) {
2239                refcount_inc(&head->refs);
2240                spin_unlock(&delayed_refs->lock);
2241
2242                btrfs_release_path(path);
2243
2244                /*
2245                 * Mutex was contended, block until it's released and let
2246                 * caller try again
2247                 */
2248                mutex_lock(&head->mutex);
2249                mutex_unlock(&head->mutex);
2250                btrfs_put_delayed_ref_head(head);
2251                btrfs_put_transaction(cur_trans);
2252                return -EAGAIN;
2253        }
2254        spin_unlock(&delayed_refs->lock);
2255
2256        spin_lock(&head->lock);
2257        /*
2258         * XXX: We should replace this with a proper search function in the
2259         * future.
2260         */
2261        for (node = rb_first_cached(&head->ref_tree); node;
2262             node = rb_next(node)) {
2263                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2264                /* If it's a shared ref we know a cross reference exists */
2265                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2266                        ret = 1;
2267                        break;
2268                }
2269
2270                data_ref = btrfs_delayed_node_to_data_ref(ref);
2271
2272                /*
2273                 * If our ref doesn't match the one we're currently looking at
2274                 * then we have a cross reference.
2275                 */
2276                if (data_ref->root != root->root_key.objectid ||
2277                    data_ref->objectid != objectid ||
2278                    data_ref->offset != offset) {
2279                        ret = 1;
2280                        break;
2281                }
2282        }
2283        spin_unlock(&head->lock);
2284        mutex_unlock(&head->mutex);
2285        btrfs_put_transaction(cur_trans);
2286        return ret;
2287}
2288
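    /*
     * Same check against the committed extent tree: returns 0 when the last
     * committed extent item proves @bytenr is referenced only by this
     * root/inode/offset, 1 when it is (or may be) shared, and -ENOENT when
     * no matching extent item was found.
     */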
2289static noinline int check_committed_ref(struct btrfs_root *root,
2290                                        struct btrfs_path *path,
2291                                        u64 objectid, u64 offset, u64 bytenr,
2292                                        bool strict)
2293{
2294        struct btrfs_fs_info *fs_info = root->fs_info;
2295        struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2296        struct extent_buffer *leaf;
2297        struct btrfs_extent_data_ref *ref;
2298        struct btrfs_extent_inline_ref *iref;
2299        struct btrfs_extent_item *ei;
2300        struct btrfs_key key;
2301        u32 item_size;
2302        int type;
2303        int ret;
2304
2305        key.objectid = bytenr;
2306        key.offset = (u64)-1;
2307        key.type = BTRFS_EXTENT_ITEM_KEY;
2308
2309        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2310        if (ret < 0)
2311                goto out;
2312        BUG_ON(ret == 0); /* Corruption */
2313
2314        ret = -ENOENT;
2315        if (path->slots[0] == 0)
2316                goto out;
2317
2318        path->slots[0]--;
2319        leaf = path->nodes[0];
2320        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2321
2322        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2323                goto out;
2324
2325        ret = 1;
2326        item_size = btrfs_item_size(leaf, path->slots[0]);
2327        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2328
2329        /* If extent item has more than 1 inline ref then it's shared */
2330        if (item_size != sizeof(*ei) +
2331            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2332                goto out;
2333
2334        /*
2335         * If the extent was created before the last snapshot, it's shared unless
2336         * the snapshot has been deleted. Use the heuristic if strict is false.
2337         */
2338        if (!strict &&
2339            (btrfs_extent_generation(leaf, ei) <=
2340             btrfs_root_last_snapshot(&root->root_item)))
2341                goto out;
2342
2343        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2344
2345        /* If this extent has SHARED_DATA_REF then it's shared */
2346        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2347        if (type != BTRFS_EXTENT_DATA_REF_KEY)
2348                goto out;
2349
2350        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2351        if (btrfs_extent_refs(leaf, ei) !=
2352            btrfs_extent_data_ref_count(leaf, ref) ||
2353            btrfs_extent_data_ref_root(leaf, ref) !=
2354            root->root_key.objectid ||
2355            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2356            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2357                goto out;
2358
2359        ret = 0;
2360out:
2361        return ret;
2362}
2363
2364int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2365                          u64 bytenr, bool strict, struct btrfs_path *path)
2366{
2367        int ret;
2368
2369        do {
2370                ret = check_committed_ref(root, path, objectid,
2371                                          offset, bytenr, strict);
2372                if (ret && ret != -ENOENT)
2373                        goto out;
2374
2375                ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2376        } while (ret == -EAGAIN);
2377
2378out:
2379        btrfs_release_path(path);
2380        if (btrfs_is_data_reloc_root(root))
2381                WARN_ON(ret > 0);
2382        return ret;
2383}
2384
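    /*
     * Add or drop (depending on @inc) one reference for every extent that
     * @buf points to: file extents for leaves, child tree blocks for nodes.
     * With @full_backref the references are recorded as shared, using
     * @buf's own bytenr as the parent.
     */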
2385static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2386                           struct btrfs_root *root,
2387                           struct extent_buffer *buf,
2388                           int full_backref, int inc)
2389{
2390        struct btrfs_fs_info *fs_info = root->fs_info;
2391        u64 bytenr;
2392        u64 num_bytes;
2393        u64 parent;
2394        u64 ref_root;
2395        u32 nritems;
2396        struct btrfs_key key;
2397        struct btrfs_file_extent_item *fi;
2398        struct btrfs_ref generic_ref = { 0 };
2399        bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2400        int i;
2401        int action;
2402        int level;
2403        int ret = 0;
2404
2405        if (btrfs_is_testing(fs_info))
2406                return 0;
2407
2408        ref_root = btrfs_header_owner(buf);
2409        nritems = btrfs_header_nritems(buf);
2410        level = btrfs_header_level(buf);
2411
2412        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2413                return 0;
2414
2415        if (full_backref)
2416                parent = buf->start;
2417        else
2418                parent = 0;
2419        if (inc)
2420                action = BTRFS_ADD_DELAYED_REF;
2421        else
2422                action = BTRFS_DROP_DELAYED_REF;
2423
2424        for (i = 0; i < nritems; i++) {
2425                if (level == 0) {
2426                        btrfs_item_key_to_cpu(buf, &key, i);
2427                        if (key.type != BTRFS_EXTENT_DATA_KEY)
2428                                continue;
2429                        fi = btrfs_item_ptr(buf, i,
2430                                            struct btrfs_file_extent_item);
2431                        if (btrfs_file_extent_type(buf, fi) ==
2432                            BTRFS_FILE_EXTENT_INLINE)
2433                                continue;
2434                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2435                        if (bytenr == 0)
2436                                continue;
2437
2438                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2439                        key.offset -= btrfs_file_extent_offset(buf, fi);
2440                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
2441                                               num_bytes, parent);
2442                        btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2443                                            key.offset, root->root_key.objectid,
2444                                            for_reloc);
2445                        if (inc)
2446                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
2447                        else
2448                                ret = btrfs_free_extent(trans, &generic_ref);
2449                        if (ret)
2450                                goto fail;
2451                } else {
2452                        bytenr = btrfs_node_blockptr(buf, i);
2453                        num_bytes = fs_info->nodesize;
2454                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
2455                                               num_bytes, parent);
2456                        btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2457                                            root->root_key.objectid, for_reloc);
2458                        if (inc)
2459                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
2460                        else
2461                                ret = btrfs_free_extent(trans, &generic_ref);
2462                        if (ret)
2463                                goto fail;
2464                }
2465        }
2466        return 0;
2467fail:
2468        return ret;
2469}
2470
2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2472                  struct extent_buffer *buf, int full_backref)
2473{
2474        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2475}
2476
2477int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2478                  struct extent_buffer *buf, int full_backref)
2479{
2480        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2481}
2482
2483static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2484{
2485        struct btrfs_fs_info *fs_info = root->fs_info;
2486        u64 flags;
2487        u64 ret;
2488
2489        if (data)
2490                flags = BTRFS_BLOCK_GROUP_DATA;
2491        else if (root == fs_info->chunk_root)
2492                flags = BTRFS_BLOCK_GROUP_SYSTEM;
2493        else
2494                flags = BTRFS_BLOCK_GROUP_METADATA;
2495
2496        ret = btrfs_get_alloc_profile(fs_info, flags);
2497        return ret;
2498}
2499
2500static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2501{
2502        struct rb_node *leftmost;
2503        u64 bytenr = 0;
2504
2505        read_lock(&fs_info->block_group_cache_lock);
2506        /* Get the block group with the lowest logical start address. */
2507        leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2508        if (leftmost) {
2509                struct btrfs_block_group *bg;
2510
2511                bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2512                bytenr = bg->start;
2513        }
2514        read_unlock(&fs_info->block_group_cache_lock);
2515
2516        return bytenr;
2517}
2518
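    /*
     * Account @num_bytes starting at @bytenr as pinned in the block group
     * and its space_info, and mark the range dirty in the transaction's
     * pinned_extents tree. If @reserved, the bytes are moved from the
     * reserved counters to pinned.
     */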
2519static int pin_down_extent(struct btrfs_trans_handle *trans,
2520                           struct btrfs_block_group *cache,
2521                           u64 bytenr, u64 num_bytes, int reserved)
2522{
2523        struct btrfs_fs_info *fs_info = cache->fs_info;
2524
2525        spin_lock(&cache->space_info->lock);
2526        spin_lock(&cache->lock);
2527        cache->pinned += num_bytes;
2528        btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2529                                             num_bytes);
2530        if (reserved) {
2531                cache->reserved -= num_bytes;
2532                cache->space_info->bytes_reserved -= num_bytes;
2533        }
2534        spin_unlock(&cache->lock);
2535        spin_unlock(&cache->space_info->lock);
2536
2537        set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2538                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2539        return 0;
2540}
2541
2542int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2543                     u64 bytenr, u64 num_bytes, int reserved)
2544{
2545        struct btrfs_block_group *cache;
2546
2547        cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2548        BUG_ON(!cache); /* Logic error */
2549
2550        pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2551
2552        btrfs_put_block_group(cache);
2553        return 0;
2554}
2555
2556/*
2557 * This function must be called within a transaction.
2558 */
2559int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2560                                    u64 bytenr, u64 num_bytes)
2561{
2562        struct btrfs_block_group *cache;
2563        int ret;
2564
2565        cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2566        if (!cache)
2567                return -EINVAL;
2568
2569        /*
2570         * pull in the free space cache (if any) so that our pin
2571         * removes the free space from the cache.  We have load_only set
2572         * to one because the slow code to read in the free extents does check
2573         * the pinned extents.
2574         */
2575        btrfs_cache_block_group(cache, 1);
2576        /*
2577         * Make sure we wait until the cache is completely built in case it is
2578         * missing or is invalid and therefore needs to be rebuilt.
2579         */
2580        ret = btrfs_wait_block_group_cache_done(cache);
2581        if (ret)
2582                goto out;
2583
2584        pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2585
2586        /* remove us from the free space cache (if we're there at all) */
2587        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2588out:
2589        btrfs_put_block_group(cache);
2590        return ret;
2591}
2592
2593static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2594                                   u64 start, u64 num_bytes)
2595{
2596        int ret;
2597        struct btrfs_block_group *block_group;
2598
2599        block_group = btrfs_lookup_block_group(fs_info, start);
2600        if (!block_group)
2601                return -EINVAL;
2602
2603        btrfs_cache_block_group(block_group, 1);
2604        /*
2605         * Make sure we wait until the cache is completely built in case it is
2606         * missing or is invalid and therefore needs to be rebuilt.
2607         */
2608        ret = btrfs_wait_block_group_cache_done(block_group);
2609        if (ret)
2610                goto out;
2611
2612        ret = btrfs_remove_free_space(block_group, start, num_bytes);
2613out:
2614        btrfs_put_block_group(block_group);
2615        return ret;
2616}
2617
2618int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2619{
2620        struct btrfs_fs_info *fs_info = eb->fs_info;
2621        struct btrfs_file_extent_item *item;
2622        struct btrfs_key key;
2623        int found_type;
2624        int i;
2625        int ret = 0;
2626
2627        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2628                return 0;
2629
2630        for (i = 0; i < btrfs_header_nritems(eb); i++) {
2631                btrfs_item_key_to_cpu(eb, &key, i);
2632                if (key.type != BTRFS_EXTENT_DATA_KEY)
2633                        continue;
2634                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2635                found_type = btrfs_file_extent_type(eb, item);
2636                if (found_type == BTRFS_FILE_EXTENT_INLINE)
2637                        continue;
2638                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2639                        continue;
2640                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2641                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2642                ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2643                if (ret)
2644                        break;
2645        }
2646
2647        return ret;
2648}
2649
2650static void
2651btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2652{
2653        atomic_inc(&bg->reservations);
2654}
2655
2656/*
2657 * Returns the free cluster for the given space info and sets empty_cluster to
2658 * what it should be based on the mount options.
2659 */
2660static struct btrfs_free_cluster *
2661fetch_cluster_info(struct btrfs_fs_info *fs_info,
2662                   struct btrfs_space_info *space_info, u64 *empty_cluster)
2663{
2664        struct btrfs_free_cluster *ret = NULL;
2665
2666        *empty_cluster = 0;
2667        if (btrfs_mixed_space_info(space_info))
2668                return ret;
2669
2670        if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2671                ret = &fs_info->meta_alloc_cluster;
2672                if (btrfs_test_opt(fs_info, SSD))
2673                        *empty_cluster = SZ_2M;
2674                else
2675                        *empty_cluster = SZ_64K;
2676        } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2677                   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2678                *empty_cluster = SZ_2M;
2679                ret = &fs_info->data_alloc_cluster;
2680        }
2681
2682        return ret;
2683}
2684
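    /*
     * Unpin all extents in [@start, @end]: per block group, drop the pinned
     * accounting, optionally return the space to the free space cache and
     * the global block reserve, and wake any waiting space tickets.
     */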
2685static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2686                              u64 start, u64 end,
2687                              const bool return_free_space)
2688{
2689        struct btrfs_block_group *cache = NULL;
2690        struct btrfs_space_info *space_info;
2691        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2692        struct btrfs_free_cluster *cluster = NULL;
2693        u64 len;
2694        u64 total_unpinned = 0;
2695        u64 empty_cluster = 0;
2696        bool readonly;
2697
2698        while (start <= end) {
2699                readonly = false;
2700                if (!cache ||
2701                    start >= cache->start + cache->length) {
2702                        if (cache)
2703                                btrfs_put_block_group(cache);
2704                        total_unpinned = 0;
2705                        cache = btrfs_lookup_block_group(fs_info, start);
2706                        BUG_ON(!cache); /* Logic error */
2707
2708                        cluster = fetch_cluster_info(fs_info,
2709                                                     cache->space_info,
2710                                                     &empty_cluster);
2711                        empty_cluster <<= 1;
2712                }
2713
2714                len = cache->start + cache->length - start;
2715                len = min(len, end + 1 - start);
2716
2717                down_read(&fs_info->commit_root_sem);
2718                if (start < cache->last_byte_to_unpin && return_free_space) {
2719                        u64 add_len = min(len, cache->last_byte_to_unpin - start);
2720
2721                        btrfs_add_free_space(cache, start, add_len);
2722                }
2723                up_read(&fs_info->commit_root_sem);
2724
2725                start += len;
2726                total_unpinned += len;
2727                space_info = cache->space_info;
2728
2729                /*
2730                 * If this space cluster has been marked as fragmented and we've
2731                 * unpinned enough in this block group to potentially allow a
2732                 * cluster to be created inside of it, go ahead and clear the
2733                 * fragmented check.
2734                 */
2735                if (cluster && cluster->fragmented &&
2736                    total_unpinned > empty_cluster) {
2737                        spin_lock(&cluster->lock);
2738                        cluster->fragmented = 0;
2739                        spin_unlock(&cluster->lock);
2740                }
2741
2742                spin_lock(&space_info->lock);
2743                spin_lock(&cache->lock);
2744                cache->pinned -= len;
2745                btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2746                space_info->max_extent_size = 0;
2747                if (cache->ro) {
2748                        space_info->bytes_readonly += len;
2749                        readonly = true;
2750                } else if (btrfs_is_zoned(fs_info)) {
2751                        /* Need reset before reusing in a zoned block group */
2752                        space_info->bytes_zone_unusable += len;
2753                        readonly = true;
2754                }
2755                spin_unlock(&cache->lock);
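                    /*
                     * First top up the global block reserve from the space we
                     * just unpinned, then hand anything left over to waiting
                     * reservation tickets below.
                     */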
2756                if (!readonly && return_free_space &&
2757                    global_rsv->space_info == space_info) {
2758                        spin_lock(&global_rsv->lock);
2759                        if (!global_rsv->full) {
2760                                u64 to_add = min(len, global_rsv->size -
2761                                                      global_rsv->reserved);
2762
2763                                global_rsv->reserved += to_add;
2764                                btrfs_space_info_update_bytes_may_use(fs_info,
2765                                                space_info, to_add);
2766                                if (global_rsv->reserved >= global_rsv->size)
2767                                        global_rsv->full = 1;
2768                                len -= to_add;
2769                        }
2770                        spin_unlock(&global_rsv->lock);
2771                }
2772                /* Add to any tickets we may have */
2773                if (!readonly && return_free_space && len)
2774                        btrfs_try_granting_tickets(fs_info, space_info);
2775                spin_unlock(&space_info->lock);
2776        }
2777
2778        if (cache)
2779                btrfs_put_block_group(cache);
2780        return 0;
2781}
2782
2783int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2784{
2785        struct btrfs_fs_info *fs_info = trans->fs_info;
2786        struct btrfs_block_group *block_group, *tmp;
2787        struct list_head *deleted_bgs;
2788        struct extent_io_tree *unpin;
2789        u64 start;
2790        u64 end;
2791        int ret;
2792
2793        unpin = &trans->transaction->pinned_extents;
2794
2795        while (!TRANS_ABORTED(trans)) {
2796                struct extent_state *cached_state = NULL;
2797
2798                mutex_lock(&fs_info->unused_bg_unpin_mutex);
2799                ret = find_first_extent_bit(unpin, 0, &start, &end,
2800                                            EXTENT_DIRTY, &cached_state);
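                    /*
                     * A non-zero return from find_first_extent_bit() means
                     * there are no more pinned extent ranges left to process.
                     */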
2801                if (ret) {
2802                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2803                        break;
2804                }
2805
2806                if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2807                        ret = btrfs_discard_extent(fs_info, start,
2808                                                   end + 1 - start, NULL);
2809
2810                clear_extent_dirty(unpin, start, end, &cached_state);
2811                unpin_extent_range(fs_info, start, end, true);
2812                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2813                free_extent_state(cached_state);
2814                cond_resched();
2815        }
2816
2817        if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2818                btrfs_discard_calc_delay(&fs_info->discard_ctl);
2819                btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2820        }
2821
2822        /*
2823         * Transaction is finished.  We don't need the lock anymore.  We
2824         * do need to clean up the block groups in case of a transaction
2825         * abort.
2826         */
2827        deleted_bgs = &trans->transaction->deleted_bgs;
2828        list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2829                u64 trimmed = 0;
2830
2831                ret = -EROFS;
2832                if (!TRANS_ABORTED(trans))
2833                        ret = btrfs_discard_extent(fs_info,
2834                                                   block_group->start,
2835                                                   block_group->length,
2836                                                   &trimmed);
2837
2838                list_del_init(&block_group->bg_list);
2839                btrfs_unfreeze_block_group(block_group);
2840                btrfs_put_block_group(block_group);
2841
2842                if (ret) {
2843                        const char *errstr = btrfs_decode_error(ret);
2844                        btrfs_warn(fs_info,
2845                           "discard failed while removing blockgroup: errno=%d %s",
2846                                   ret, errstr);
2847                }
2848        }
2849
2850        return 0;
2851}
2852
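    /*
     * Final accounting after the extent item(s) have been removed: drop the
     * csum items for data extents, return the range to the free space tree
     * and decrement the block group's used bytes.
     */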
2853static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2854                                     u64 bytenr, u64 num_bytes, bool is_data)
2855{
2856        int ret;
2857
2858        if (is_data) {
2859                struct btrfs_root *csum_root;
2860
2861                csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2862                ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2863                if (ret) {
2864                        btrfs_abort_transaction(trans, ret);
2865                        return ret;
2866                }
2867        }
2868
2869        ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2870        if (ret) {
2871                btrfs_abort_transaction(trans, ret);
2872                return ret;
2873        }
2874
2875        ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2876        if (ret)
2877                btrfs_abort_transaction(trans, ret);
2878
2879        return ret;
2880}
2881
2882/*
2883 * Drop one or more refs of @node.
2884 *
2885 * 1. Locate the extent refs.
2886 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2887 *    Locate it, then reduce the refs number or remove the ref line completely.
2888 *
2889 * 2. Update the refs count in EXTENT/METADATA_ITEM
2890 *
2891 * Inline backref case:
2892 *
2893 * in extent tree we have:
2894 *
2895 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2896 *              refs 2 gen 6 flags DATA
2897 *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2898 *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2899 *
2900 * This function gets called with:
2901 *
2902 *    node->bytenr = 13631488
2903 *    node->num_bytes = 1048576
2904 *    root_objectid = FS_TREE
2905 *    owner_objectid = 257
2906 *    owner_offset = 0
2907 *    refs_to_drop = 1
2908 *
2909 * Then we should get something like:
2910 *
2911 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2912 *              refs 1 gen 6 flags DATA
2913 *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2914 *
2915 * Keyed backref case:
2916 *
2917 * in extent tree we have:
2918 *
2919 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2920 *              refs 754 gen 6 flags DATA
2921 *      [...]
2922 *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2923 *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2924 *
2925 * This function gets called with:
2926 *
2927 *    node->bytenr = 13631488
2928 *    node->num_bytes = 1048576
2929 *    root_objectid = FS_TREE
2930 *    owner_objectid = 866
2931 *    owner_offset = 0
2932 *    refs_to_drop = 1
2933 *
2934 * Then we should get something like:
2935 *
2936 *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2937 *              refs 753 gen 6 flags DATA
2938 *
2939 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2940 */
2941static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2942                               struct btrfs_delayed_ref_node *node, u64 parent,
2943                               u64 root_objectid, u64 owner_objectid,
2944                               u64 owner_offset, int refs_to_drop,
2945                               struct btrfs_delayed_extent_op *extent_op)
2946{
2947        struct btrfs_fs_info *info = trans->fs_info;
2948        struct btrfs_key key;
2949        struct btrfs_path *path;
2950        struct btrfs_root *extent_root;
2951        struct extent_buffer *leaf;
2952        struct btrfs_extent_item *ei;
2953        struct btrfs_extent_inline_ref *iref;
2954        int ret;
2955        int is_data;
2956        int extent_slot = 0;
2957        int found_extent = 0;
2958        int num_to_del = 1;
2959        u32 item_size;
2960        u64 refs;
2961        u64 bytenr = node->bytenr;
2962        u64 num_bytes = node->num_bytes;
2963        bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2964
2965        extent_root = btrfs_extent_root(info, bytenr);
2966        ASSERT(extent_root);
2967
2968        path = btrfs_alloc_path();
2969        if (!path)
2970                return -ENOMEM;
2971
2972        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2973
2974        if (!is_data && refs_to_drop != 1) {
2975                btrfs_crit(info,
2976"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2977                           node->bytenr, refs_to_drop);
2978                ret = -EINVAL;
2979                btrfs_abort_transaction(trans, ret);
2980                goto out;
2981        }
2982
2983        if (is_data)
2984                skinny_metadata = false;
2985
2986        ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2987                                    parent, root_objectid, owner_objectid,
2988                                    owner_offset);
2989        if (ret == 0) {
2990                /*
2991                 * Either the inline backref or the SHARED_DATA_REF/
2992                 * SHARED_BLOCK_REF was found.
2993                 *
2994                 * Here is a quick path to locate EXTENT/METADATA_ITEM.
2995                 * It's possible the EXTENT/METADATA_ITEM is near the current slot.
2996                 */
2997                extent_slot = path->slots[0];
2998                while (extent_slot >= 0) {
2999                        btrfs_item_key_to_cpu(path->nodes[0], &key,
3000                                              extent_slot);
3001                        if (key.objectid != bytenr)
3002                                break;
3003                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3004                            key.offset == num_bytes) {
3005                                found_extent = 1;
3006                                break;
3007                        }
3008                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
3009                            key.offset == owner_objectid) {
3010                                found_extent = 1;
3011                                break;
3012                        }
3013
3014                        /* Quick path didn't find the EXTENT/METADATA_ITEM */
3015                        if (path->slots[0] - extent_slot > 5)
3016                                break;
3017                        extent_slot--;
3018                }
3019
3020                if (!found_extent) {
3021                        if (iref) {
3022                                btrfs_crit(info,
3023"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3024                                btrfs_abort_transaction(trans, -EUCLEAN);
3025                                goto err_dump;
3026                        }
3027                        /* Must be SHARED_* item, remove the backref first */
3028                        ret = remove_extent_backref(trans, extent_root, path,
3029                                                    NULL, refs_to_drop, is_data);
3030                        if (ret) {
3031                                btrfs_abort_transaction(trans, ret);
3032                                goto out;
3033                        }
3034                        btrfs_release_path(path);
3035
3036                        /* Slow path to locate EXTENT/METADATA_ITEM */
3037                        key.objectid = bytenr;
3038                        key.type = BTRFS_EXTENT_ITEM_KEY;
3039                        key.offset = num_bytes;
3040
3041                        if (!is_data && skinny_metadata) {
3042                                key.type = BTRFS_METADATA_ITEM_KEY;
3043                                key.offset = owner_objectid;
3044                        }
3045
3046                        ret = btrfs_search_slot(trans, extent_root,
3047                                                &key, path, -1, 1);
3048                        if (ret > 0 && skinny_metadata && path->slots[0]) {
3049                                /*
3050                                 * Couldn't find our skinny metadata item,
3051                                 * see if we have ye olde extent item.
3052                                 */
3053                                path->slots[0]--;
3054                                btrfs_item_key_to_cpu(path->nodes[0], &key,
3055                                                      path->slots[0]);
3056                                if (key.objectid == bytenr &&
3057                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
3058                                    key.offset == num_bytes)
3059                                        ret = 0;
3060                        }
3061
3062                        if (ret > 0 && skinny_metadata) {
3063                                skinny_metadata = false;
3064                                key.objectid = bytenr;
3065                                key.type = BTRFS_EXTENT_ITEM_KEY;
3066                                key.offset = num_bytes;
3067                                btrfs_release_path(path);
3068                                ret = btrfs_search_slot(trans, extent_root,
3069                                                        &key, path, -1, 1);
3070                        }
3071
3072                        if (ret) {
3073                                btrfs_err(info,
3074                                          "umm, got %d back from search, was looking for %llu",
3075                                          ret, bytenr);
3076                                if (ret > 0)
3077                                        btrfs_print_leaf(path->nodes[0]);
3078                        }
3079                        if (ret < 0) {
3080                                btrfs_abort_transaction(trans, ret);
3081                                goto out;
3082                        }
3083                        extent_slot = path->slots[0];
3084                }
3085        } else if (WARN_ON(ret == -ENOENT)) {
3086                btrfs_print_leaf(path->nodes[0]);
3087                btrfs_err(info,
3088                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3089                        bytenr, parent, root_objectid, owner_objectid,
3090                        owner_offset);
3091                btrfs_abort_transaction(trans, ret);
3092                goto out;
3093        } else {
3094                btrfs_abort_transaction(trans, ret);
3095                goto out;
3096        }
3097
3098        leaf = path->nodes[0];
3099        item_size = btrfs_item_size(leaf, extent_slot);
3100        if (unlikely(item_size < sizeof(*ei))) {
3101                ret = -EINVAL;
3102                btrfs_print_v0_err(info);
3103                btrfs_abort_transaction(trans, ret);
3104                goto out;
3105        }
3106        ei = btrfs_item_ptr(leaf, extent_slot,
3107                            struct btrfs_extent_item);
3108        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3109            key.type == BTRFS_EXTENT_ITEM_KEY) {
3110                struct btrfs_tree_block_info *bi;
3111                if (item_size < sizeof(*ei) + sizeof(*bi)) {
3112                        btrfs_crit(info,
3113"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3114                                   key.objectid, key.type, key.offset,
3115                                   owner_objectid, item_size,
3116                                   sizeof(*ei) + sizeof(*bi));
3117                        btrfs_abort_transaction(trans, -EUCLEAN);
3118                        goto err_dump;
3119                }
3120                bi = (struct btrfs_tree_block_info *)(ei + 1);
3121                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3122        }
3123
3124        refs = btrfs_extent_refs(leaf, ei);
3125        if (refs < refs_to_drop) {
3126                btrfs_crit(info,
3127                "trying to drop %d refs but we only have %llu for bytenr %llu",
3128                          refs_to_drop, refs, bytenr);
3129                btrfs_abort_transaction(trans, -EUCLEAN);
3130                goto err_dump;
3131        }
3132        refs -= refs_to_drop;
3133
3134        if (refs > 0) {
3135                if (extent_op)
3136                        __run_delayed_extent_op(extent_op, leaf, ei);
3137                /*
3138                 * In the case of an inline backref, the reference count will
3139                 * be updated by remove_extent_backref().
3140                 */
3141                if (iref) {
3142                        if (!found_extent) {
3143                                btrfs_crit(info,
3144"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3145                                btrfs_abort_transaction(trans, -EUCLEAN);
3146                                goto err_dump;
3147                        }
3148                } else {
3149                        btrfs_set_extent_refs(leaf, ei, refs);
3150                        btrfs_mark_buffer_dirty(leaf);
3151                }
3152                if (found_extent) {
3153                        ret = remove_extent_backref(trans, extent_root, path,
3154                                                    iref, refs_to_drop, is_data);
3155                        if (ret) {
3156                                btrfs_abort_transaction(trans, ret);
3157                                goto out;
3158                        }
3159                }
3160        } else {
3161                /* In this branch refs == 1 */
3162                if (found_extent) {
3163                        if (is_data && refs_to_drop !=
3164                            extent_data_ref_count(path, iref)) {
3165                                btrfs_crit(info,
3166                "invalid refs_to_drop, current refs %u refs_to_drop %u",
3167                                           extent_data_ref_count(path, iref),
3168                                           refs_to_drop);
3169                                btrfs_abort_transaction(trans, -EUCLEAN);
3170                                goto err_dump;
3171                        }
3172                        if (iref) {
3173                                if (path->slots[0] != extent_slot) {
3174                                        btrfs_crit(info,
3175"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3176                                                   key.objectid, key.type,
3177                                                   key.offset);
3178                                        btrfs_abort_transaction(trans, -EUCLEAN);
3179                                        goto err_dump;
3180                                }
3181                        } else {
3182                                /*
3183                                 * No inline ref, so we must be at a SHARED_* item,
3184                                 * and since it's a single ref the layout must be:
3185                                 * |    extent_slot       ||extent_slot + 1|
3186                                 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3187                                 */
3188                                if (path->slots[0] != extent_slot + 1) {
3189                                        btrfs_crit(info,
3190        "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3191                                        btrfs_abort_transaction(trans, -EUCLEAN);
3192                                        goto err_dump;
3193                                }
3194                                path->slots[0] = extent_slot;
3195                                num_to_del = 2;
3196                        }
3197                }
3198
3199                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3200                                      num_to_del);
3201                if (ret) {
3202                        btrfs_abort_transaction(trans, ret);
3203                        goto out;
3204                }
3205                btrfs_release_path(path);
3206
3207                ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
3208        }
3209        btrfs_release_path(path);
3210
3211out:
3212        btrfs_free_path(path);
3213        return ret;
3214err_dump:
3215        /*
3216         * A leaf dump can take up a lot of log buffer, so we only do a full
3217         * leaf dump for debug builds.
3218         */
3219        if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3220                btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3221                           path->slots[0], extent_slot);
3222                btrfs_print_leaf(path->nodes[0]);
3223        }
3224
3225        btrfs_free_path(path);
3226        return -EUCLEAN;
3227}
3228
3229/*
3230 * When we free a block, it is possible (and likely) that we free the last
3231 * delayed ref for that extent as well.  This searches the delayed ref tree for
3232 * a given extent, and if there are no other delayed refs to be processed, it
3233 * removes it from the tree.
3234 */
3235static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3236                                      u64 bytenr)
3237{
3238        struct btrfs_delayed_ref_head *head;
3239        struct btrfs_delayed_ref_root *delayed_refs;
3240        int ret = 0;
3241
3242        delayed_refs = &trans->transaction->delayed_refs;
3243        spin_lock(&delayed_refs->lock);
3244        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3245        if (!head)
3246                goto out_delayed_unlock;
3247
3248        spin_lock(&head->lock);
3249        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3250                goto out;
3251
3252        if (cleanup_extent_op(head) != NULL)
3253                goto out;
3254
3255        /*
3256         * Waiting for the lock here would deadlock.  If someone else has it
3257         * locked, they are already in the process of dropping it anyway.
3258         */
3259        if (!mutex_trylock(&head->mutex))
3260                goto out;
3261
3262        btrfs_delete_ref_head(delayed_refs, head);
3263        head->processing = 0;
3264
3265        spin_unlock(&head->lock);
3266        spin_unlock(&delayed_refs->lock);
3267
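            /*
             * must_insert_reserved means the extent was never inserted into
             * the extent tree, so the caller must free (or pin) the reserved
             * space itself; hence return 1.
             */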
3268        BUG_ON(head->extent_op);
3269        if (head->must_insert_reserved)
3270                ret = 1;
3271
3272        btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3273        mutex_unlock(&head->mutex);
3274        btrfs_put_delayed_ref_head(head);
3275        return ret;
3276out:
3277        spin_unlock(&head->lock);
3278
3279out_delayed_unlock:
3280        spin_unlock(&delayed_refs->lock);
3281        return 0;
3282}
3283
3284void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3285                           u64 root_id,
3286                           struct extent_buffer *buf,
3287                           u64 parent, int last_ref)
3288{
3289        struct btrfs_fs_info *fs_info = trans->fs_info;
3290        struct btrfs_ref generic_ref = { 0 };
3291        int ret;
3292
3293        btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3294                               buf->start, buf->len, parent);
3295        btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3296                            root_id, 0, false);
3297
3298        if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3299                btrfs_ref_tree_mod(fs_info, &generic_ref);
3300                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3301                BUG_ON(ret); /* -ENOMEM */
3302        }
3303
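            /*
             * If this is the last ref and the block was allocated in the
             * current transaction, we may be able to bypass the delayed ref
             * machinery and free (or pin) the extent right away.
             */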
3304        if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3305                struct btrfs_block_group *cache;
3306                bool must_pin = false;
3307
3308                if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3309                        ret = check_ref_cleanup(trans, buf->start);
3310                        if (!ret) {
3311                                btrfs_redirty_list_add(trans->transaction, buf);
3312                                goto out;
3313                        }
3314                }
3315
3316                cache = btrfs_lookup_block_group(fs_info, buf->start);
3317
3318                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3319                        pin_down_extent(trans, cache, buf->start, buf->len, 1);
3320                        btrfs_put_block_group(cache);
3321                        goto out;
3322                }
3323
3324                /*
3325                 * If this is a leaf and there are tree mod log users, we may
3326                 * have recorded mod log operations that point to this leaf.
3327                 * So we must make sure no one reuses this leaf's extent before
3328                 * mod log operations are applied to a node, otherwise after
3329                 * rewinding a node using the mod log operations we get an
3330                 * inconsistent btree, as the leaf's extent may now be used as
3331                 * a node or leaf for another btree.
3332                 * We are safe from races here because at this point no other
3333                 * node or root points to this extent buffer, so if after this
3334                 * check a new tree mod log user joins, it will not be able to
3335                 * find a node pointing to this leaf and record operations that
3336                 * point to this leaf.
3337                 */
3338                if (btrfs_header_level(buf) == 0 &&
3339                    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3340                        must_pin = true;
3341
3342                if (must_pin || btrfs_is_zoned(fs_info)) {
3343                        btrfs_redirty_list_add(trans->transaction, buf);
3344                        pin_down_extent(trans, cache, buf->start, buf->len, 1);
3345                        btrfs_put_block_group(cache);
3346                        goto out;
3347                }
3348
3349                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3350
3351                btrfs_add_free_space(cache, buf->start, buf->len);
3352                btrfs_free_reserved_bytes(cache, buf->len, 0);
3353                btrfs_put_block_group(cache);
3354                trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3355        }
3356out:
3357        if (last_ref) {
3358                /*
3359                 * Deleting the buffer, clear the corrupt flag since it doesn't
3360                 * matter anymore.
3361                 */
3362                clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3363        }
3364}
3365
3366/* Can return -ENOMEM */
3367int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3368{
3369        struct btrfs_fs_info *fs_info = trans->fs_info;
3370        int ret;
3371
3372        if (btrfs_is_testing(fs_info))
3373                return 0;
3374
3375        /*
3376         * Tree log blocks never actually go into the extent allocation
3377         * tree, so just update the pinning info and exit early.
3378         */
3379        if ((ref->type == BTRFS_REF_METADATA &&
3380             ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3381            (ref->type == BTRFS_REF_DATA &&
3382             ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3383                /* unlocks the pinned mutex */
3384                btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3385                ret = 0;
3386        } else if (ref->type == BTRFS_REF_METADATA) {
3387                ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3388        } else {
3389                ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3390        }
3391
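            /*
             * Tree log blocks were pinned above and never become delayed
             * refs, so don't record a ref tree modification for them.
             */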
3392        if (!((ref->type == BTRFS_REF_METADATA &&
3393               ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3394              (ref->type == BTRFS_REF_DATA &&
3395               ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3396                btrfs_ref_tree_mod(fs_info, ref);
3397
3398        return ret;
3399}
3400
3401enum btrfs_loop_type {
3402        LOOP_CACHING_NOWAIT,
3403        LOOP_CACHING_WAIT,
3404        LOOP_ALLOC_CHUNK,
3405        LOOP_NO_EMPTY_SIZE,
3406};
3407
3408static inline void
3409btrfs_lock_block_group(struct btrfs_block_group *cache,
3410                       int delalloc)
3411{
3412        if (delalloc)
3413                down_read(&cache->data_rwsem);
3414}
3415
3416static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3417                       int delalloc)
3418{
3419        btrfs_get_block_group(cache);
3420        if (delalloc)
3421                down_read(&cache->data_rwsem);
3422}
3423
3424static struct btrfs_block_group *btrfs_lock_cluster(
3425                   struct btrfs_block_group *block_group,
3426                   struct btrfs_free_cluster *cluster,
3427                   int delalloc)
3428        __acquires(&cluster->refill_lock)
3429{
3430        struct btrfs_block_group *used_bg = NULL;
3431
3432        spin_lock(&cluster->refill_lock);
3433        while (1) {
3434                used_bg = cluster->block_group;
3435                if (!used_bg)
3436                        return NULL;
3437
3438                if (used_bg == block_group)
3439                        return used_bg;
3440
3441                btrfs_get_block_group(used_bg);
3442
3443                if (!delalloc)
3444                        return used_bg;
3445
3446                if (down_read_trylock(&used_bg->data_rwsem))
3447                        return used_bg;
3448
3449                spin_unlock(&cluster->refill_lock);
3450
3451                /* We should only have one level of nesting here. */
3452                down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3453
3454                spin_lock(&cluster->refill_lock);
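                    /*
                     * The cluster may have switched to a different block group
                     * while we dropped refill_lock; recheck before returning.
                     */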
3455                if (used_bg == cluster->block_group)
3456                        return used_bg;
3457
3458                up_read(&used_bg->data_rwsem);
3459                btrfs_put_block_group(used_bg);
3460        }
3461}
3462
3463static inline void
3464btrfs_release_block_group(struct btrfs_block_group *cache,
3465                         int delalloc)
3466{
3467        if (delalloc)
3468                up_read(&cache->data_rwsem);
3469        btrfs_put_block_group(cache);
3470}
3471
3472enum btrfs_extent_allocation_policy {
3473        BTRFS_EXTENT_ALLOC_CLUSTERED,
3474        BTRFS_EXTENT_ALLOC_ZONED,
3475};
3476
3477/*
3478 * Structure used internally for find_free_extent() function.  Wraps needed
3479 * parameters.
3480 */
3481struct find_free_extent_ctl {
3482        /* Basic allocation info */
3483        u64 ram_bytes;
3484        u64 num_bytes;
3485        u64 min_alloc_size;
3486        u64 empty_size;
3487        u64 flags;
3488        int delalloc;
3489
3490        /* Where to start the search inside the bg */
3491        u64 search_start;
3492
3493        /* For clustered allocation */
3494        u64 empty_cluster;
3495        struct btrfs_free_cluster *last_ptr;
3496        bool use_cluster;
3497
3498        bool have_caching_bg;
3499        bool orig_have_caching_bg;
3500
3501        /* Allocation is called for tree-log */
3502        bool for_treelog;
3503
3504        /* Allocation is called for data relocation */
3505        bool for_data_reloc;
3506
3507        /* RAID index, converted from flags */
3508        int index;
3509
3510        /*
3511         * Current loop number, check find_free_extent_update_loop() for details
3512         */
3513        int loop;
3514
3515        /*
3516         * Whether we're refilling a cluster; if true we need to re-search the
3517         * current block group but shouldn't try to refill the cluster again.
3518         */
3519        bool retry_clustered;
3520
3521        /*
3522         * Whether we're updating the free space cache; if true we need to re-search
3523         * the current block group but shouldn't try updating the free space cache again.
3524         */
3525        bool retry_unclustered;
3526
3527        /* If current block group is cached */
3528        int cached;
3529
3530        /* Max contiguous hole found */
3531        u64 max_extent_size;
3532
3533        /* Total free space from free space cache, not always contiguous */
3534        u64 total_free_space;
3535
3536        /* Found result */
3537        u64 found_offset;
3538
3539        /* Hint where to start looking for an empty space */
3540        u64 hint_byte;
3541
3542        /* Allocation policy */
3543        enum btrfs_extent_allocation_policy policy;
3544};
3545
3546
3547/*
3548 * Helper function for find_free_extent().
3549 *
3550 * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3551 * Return -EAGAIN to inform the caller that we need to re-search this block group.
3552 * Return >0 to inform the caller that we found nothing.
3553 * Return 0 when we have found a location and set ffe_ctl->found_offset.
3554 */
3555static int find_free_extent_clustered(struct btrfs_block_group *bg,
3556                                      struct find_free_extent_ctl *ffe_ctl,
3557                                      struct btrfs_block_group **cluster_bg_ret)
3558{
3559        struct btrfs_block_group *cluster_bg;
3560        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3561        u64 aligned_cluster;
3562        u64 offset;
3563        int ret;
3564
3565        cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3566        if (!cluster_bg)
3567                goto refill_cluster;
3568        if (cluster_bg != bg && (cluster_bg->ro ||
3569            !block_group_bits(cluster_bg, ffe_ctl->flags)))
3570                goto release_cluster;
3571
3572        offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3573                        ffe_ctl->num_bytes, cluster_bg->start,
3574                        &ffe_ctl->max_extent_size);
3575        if (offset) {
3576                /* We have a block, we're done */
3577                spin_unlock(&last_ptr->refill_lock);
3578                trace_btrfs_reserve_extent_cluster(cluster_bg,
3579                                ffe_ctl->search_start, ffe_ctl->num_bytes);
3580                *cluster_bg_ret = cluster_bg;
3581                ffe_ctl->found_offset = offset;
3582                return 0;
3583        }
3584        WARN_ON(last_ptr->block_group != cluster_bg);
3585
3586release_cluster:
3587        /*
3588         * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3589         * let's just skip it and let the allocator find whatever block it can
3590         * find. If we reach this point, we will have tried the cluster
3591         * allocator plenty of times and not have found anything, so we are
3592         * likely way too fragmented for the clustering stuff to find anything.
3593         *
3594         * However, if the cluster is taken from the current block group,
3595         * release the cluster first, so that we stand a better chance of
3596         * succeeding in the unclustered allocation.
3597         */
3598        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3599                spin_unlock(&last_ptr->refill_lock);
3600                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3601                return -ENOENT;
3602        }
3603
3604        /* This cluster didn't work out, free it and start over */
3605        btrfs_return_cluster_to_free_space(NULL, last_ptr);
3606
3607        if (cluster_bg != bg)
3608                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3609
3610refill_cluster:
3611        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3612                spin_unlock(&last_ptr->refill_lock);
3613                return -ENOENT;
3614        }
3615
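            /*
             * Size the new cluster to at least the configured empty cluster
             * plus any extra empty_size, but never below a full stripe.
             */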
3616        aligned_cluster = max_t(u64,
3617                        ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3618                        bg->full_stripe_len);
3619        ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3620                        ffe_ctl->num_bytes, aligned_cluster);
3621        if (ret == 0) {
3622                /* Now pull our allocation out of this cluster */
3623                offset = btrfs_alloc_from_cluster(bg, last_ptr,
3624                                ffe_ctl->num_bytes, ffe_ctl->search_start,
3625                                &ffe_ctl->max_extent_size);
3626                if (offset) {
3627                        /* We found one, proceed */
3628                        spin_unlock(&last_ptr->refill_lock);
3629                        trace_btrfs_reserve_extent_cluster(bg,
3630                                        ffe_ctl->search_start,
3631                                        ffe_ctl->num_bytes);
3632                        ffe_ctl->found_offset = offset;
3633                        return 0;
3634                }
3635        } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3636                   !ffe_ctl->retry_clustered) {
3637                spin_unlock(&last_ptr->refill_lock);
3638
3639                ffe_ctl->retry_clustered = true;
3640                btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3641                                ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3642                return -EAGAIN;
3643        }
3644        /*
3645         * At this point we either didn't find a cluster or we weren't able to
3646         * allocate a block from our cluster.  Free the cluster we've been
3647         * trying to use, and go to the next block group.
3648         */
3649        btrfs_return_cluster_to_free_space(NULL, last_ptr);
3650        spin_unlock(&last_ptr->refill_lock);
3651        return 1;
3652}
3653
3654/*
3655 * Return >0 to inform the caller that we found nothing.
3656 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3657 * Return -EAGAIN to inform the caller that we need to re-search this block group.
3658 */
3659static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3660                                        struct find_free_extent_ctl *ffe_ctl)
3661{
3662        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3663        u64 offset;
3664
3665        /*
3666         * We are doing an unclustered allocation, set the fragmented flag so
3667         * we don't bother trying to set up a cluster again until we get more
3668         * space.
3669         */
3670        if (unlikely(last_ptr)) {
3671                spin_lock(&last_ptr->lock);
3672                last_ptr->fragmented = 1;
3673                spin_unlock(&last_ptr->lock);
3674        }
3675        if (ffe_ctl->cached) {
3676                struct btrfs_free_space_ctl *free_space_ctl;
3677
3678                free_space_ctl = bg->free_space_ctl;
3679                spin_lock(&free_space_ctl->tree_lock);
3680                if (free_space_ctl->free_space <
3681                    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3682                    ffe_ctl->empty_size) {
3683                        ffe_ctl->total_free_space = max_t(u64,
3684                                        ffe_ctl->total_free_space,
3685                                        free_space_ctl->free_space);
3686                        spin_unlock(&free_space_ctl->tree_lock);
3687                        return 1;
3688                }
3689                spin_unlock(&free_space_ctl->tree_lock);
3690        }
3691
3692        offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3693                        ffe_ctl->num_bytes, ffe_ctl->empty_size,
3694                        &ffe_ctl->max_extent_size);
3695
3696        /*
3697         * If we didn't find a chunk, and we haven't failed on this block group
3698         * before, and this block group is in the middle of caching and we are
3699         * ok with waiting, then go ahead and wait for progress to be made, and
3700         * set @retry_unclustered to true.
3701         *
3702         * If @retry_unclustered is true then we've already waited on this
3703         * block group once and should move on to the next block group.
3704         */
3705        if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3706            ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3707                btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3708                                                      ffe_ctl->empty_size);
3709                ffe_ctl->retry_unclustered = true;
3710                return -EAGAIN;
3711        } else if (!offset) {
3712                return 1;
3713        }
3714        ffe_ctl->found_offset = offset;
3715        return 0;
3716}
3717
3718static int do_allocation_clustered(struct btrfs_block_group *block_group,
3719                                   struct find_free_extent_ctl *ffe_ctl,
3720                                   struct btrfs_block_group **bg_ret)
3721{
3722        int ret;
3723
3724        /* We want to try to use the cluster allocator, so let's look there */
3725        if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3726                ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3727                if (ret >= 0 || ret == -EAGAIN)
3728                        return ret;
3729                /* ret == -ENOENT case falls through */
3730        }
3731
3732        return find_free_extent_unclustered(block_group, ffe_ctl);
3733}
3734
3735/*
3736 * Tree-log block group locking
3737 * ============================
3738 *
3739 * fs_info::treelog_bg_lock protects fs_info::treelog_bg, which holds the
3740 * start address of the block group that is reserved exclusively for
3741 * tree-log metadata.
3742 *
3743 * Lock nesting
3744 * ============
3745 *
3746 * space_info::lock
3747 *   block_group::lock
3748 *     fs_info::treelog_bg_lock
3749 */
3750
3751/*
3752 * Simple allocator for a sequential-only block group. It only allows sequential
3753 * allocation. No need to play with trees. This function also reserves the
3754 * bytes as in btrfs_add_reserved_bytes.
3755 */
3756static int do_allocation_zoned(struct btrfs_block_group *block_group,
3757                               struct find_free_extent_ctl *ffe_ctl,
3758                               struct btrfs_block_group **bg_ret)
3759{
3760        struct btrfs_fs_info *fs_info = block_group->fs_info;
3761        struct btrfs_space_info *space_info = block_group->space_info;
3762        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3763        u64 start = block_group->start;
3764        u64 num_bytes = ffe_ctl->num_bytes;
3765        u64 avail;
3766        u64 bytenr = block_group->start;
3767        u64 log_bytenr;
3768        u64 data_reloc_bytenr;
3769        int ret = 0;
3770        bool skip = false;
3771
3772        ASSERT(btrfs_is_zoned(block_group->fs_info));
3773
3774        /*
3775         * Do not allow non-tree-log blocks in the dedicated tree-log block
3776         * group, and vice versa.
3777         */
3778        spin_lock(&fs_info->treelog_bg_lock);
3779        log_bytenr = fs_info->treelog_bg;
3780        if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3781                           (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3782                skip = true;
3783        spin_unlock(&fs_info->treelog_bg_lock);
3784        if (skip)
3785                return 1;
3786
3787        /*
3788         * Do not allow non-relocation blocks in the dedicated relocation block
3789         * group, and vice versa.
3790         */
3791        spin_lock(&fs_info->relocation_bg_lock);
3792        data_reloc_bytenr = fs_info->data_reloc_bg;
3793        if (data_reloc_bytenr &&
3794            ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3795             (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3796                skip = true;
3797        spin_unlock(&fs_info->relocation_bg_lock);
3798        if (skip)
3799                return 1;
3800
3801        /* Check RO and no space case before trying to activate it */
3802        spin_lock(&block_group->lock);
3803        if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3804                ret = 1;
3805                /*
3806                 * May need to clear fs_info->{treelog,data_reloc}_bg.
3807                 * Return the error after taking the locks.
3808                 */
3809        }
3810        spin_unlock(&block_group->lock);
3811
3812        if (!ret && !btrfs_zone_activate(block_group)) {
3813                ret = 1;
3814                /*
3815                 * May need to clear fs_info->{treelog,data_reloc}_bg.
3816                 * Return the error after taking the locks.
3817                 */
3818        }
3819
3820        spin_lock(&space_info->lock);
3821        spin_lock(&block_group->lock);
3822        spin_lock(&fs_info->treelog_bg_lock);
3823        spin_lock(&fs_info->relocation_bg_lock);
3824
3825        if (ret)
3826                goto out;
3827
3828        ASSERT(!ffe_ctl->for_treelog ||
3829               block_group->start == fs_info->treelog_bg ||
3830               fs_info->treelog_bg == 0);
3831        ASSERT(!ffe_ctl->for_data_reloc ||
3832               block_group->start == fs_info->data_reloc_bg ||
3833               fs_info->data_reloc_bg == 0);
3834
3835        if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
3836                ret = 1;
3837                goto out;
3838        }
3839
3840        /*
3841         * Do not allow currently using block group to be tree-log dedicated
3842         * block group.
3843         */
3844        if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3845            (block_group->used || block_group->reserved)) {
3846                ret = 1;
3847                goto out;
3848        }
3849
3850        /*
3851         * Do not allow currently used block group to be the data relocation
3852         * dedicated block group.
3853         */
3854        if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3855            (block_group->used || block_group->reserved)) {
3856                ret = 1;
3857                goto out;
3858        }
3859
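            /*
             * Zoned block groups allocate sequentially, so the usable space
             * is whatever lies between the current allocation offset and the
             * zone capacity.
             */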
3860        WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3861        avail = block_group->zone_capacity - block_group->alloc_offset;
3862        if (avail < num_bytes) {
3863                if (ffe_ctl->max_extent_size < avail) {
3864                        /*
3865                         * With the sequential allocator, free space is always
3866                         * contiguous.
3867                         */
3868                        ffe_ctl->max_extent_size = avail;
3869                        ffe_ctl->total_free_space = avail;
3870                }
3871                ret = 1;
3872                goto out;
3873        }
3874
3875        if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3876                fs_info->treelog_bg = block_group->start;
3877
3878        if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
3879                fs_info->data_reloc_bg = block_group->start;
3880
3881        ffe_ctl->found_offset = start + block_group->alloc_offset;
3882        block_group->alloc_offset += num_bytes;
3883        spin_lock(&ctl->tree_lock);
3884        ctl->free_space -= num_bytes;
3885        spin_unlock(&ctl->tree_lock);
3886
3887        /*
3888         * We do not check if found_offset is aligned to stripesize. The
3889         * address is anyway rewritten when using zone append writing.
3890         */
3891
3892        ffe_ctl->search_start = ffe_ctl->found_offset;
3893
3894out:
3895        if (ret && ffe_ctl->for_treelog)
3896                fs_info->treelog_bg = 0;
3897        if (ret && ffe_ctl->for_data_reloc &&
3898            fs_info->data_reloc_bg == block_group->start) {
3899                /*
3900                 * Do not allow further allocations from this block group.
3901                 * Compared to setting ->ro, setting the
3902                 * ->zoned_data_reloc_ongoing flag still allows nocow
3903                 * writers to come in. See btrfs_inc_nocow_writers().
3904                 *
3905                 * We need to disable allocations here to avoid allocating a
3906                 * regular (non-relocation data) extent. With a mix of relocation
3907                 * extents and regular extents, we can dispatch WRITE commands
3908                 * (for relocation extents) and ZONE APPEND commands (for
3909                 * regular extents) at the same time to the same zone, which
3910                 * easily breaks the write pointer.
3911                 */
3912                block_group->zoned_data_reloc_ongoing = 1;
3913                fs_info->data_reloc_bg = 0;
3914        }
3915        spin_unlock(&fs_info->relocation_bg_lock);
3916        spin_unlock(&fs_info->treelog_bg_lock);
3917        spin_unlock(&block_group->lock);
3918        spin_unlock(&space_info->lock);
3919        return ret;
3920}
3921
3922static int do_allocation(struct btrfs_block_group *block_group,
3923                         struct find_free_extent_ctl *ffe_ctl,
3924                         struct btrfs_block_group **bg_ret)
3925{
3926        switch (ffe_ctl->policy) {
3927        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3928                return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3929        case BTRFS_EXTENT_ALLOC_ZONED:
3930                return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3931        default:
3932                BUG();
3933        }
3934}
3935
3936static void release_block_group(struct btrfs_block_group *block_group,
3937                                struct find_free_extent_ctl *ffe_ctl,
3938                                int delalloc)
3939{
3940        switch (ffe_ctl->policy) {
3941        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3942                ffe_ctl->retry_clustered = false;
3943                ffe_ctl->retry_unclustered = false;
3944                break;
3945        case BTRFS_EXTENT_ALLOC_ZONED:
3946                /* Nothing to do */
3947                break;
3948        default:
3949                BUG();
3950        }
3951
3952        BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3953               ffe_ctl->index);
3954        btrfs_release_block_group(block_group, delalloc);
3955}
3956
3957static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3958                                   struct btrfs_key *ins)
3959{
3960        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3961
3962        if (!ffe_ctl->use_cluster && last_ptr) {
3963                spin_lock(&last_ptr->lock);
3964                last_ptr->window_start = ins->objectid;
3965                spin_unlock(&last_ptr->lock);
3966        }
3967}
3968
3969static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3970                         struct btrfs_key *ins)
3971{
3972        switch (ffe_ctl->policy) {
3973        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3974                found_extent_clustered(ffe_ctl, ins);
3975                break;
3976        case BTRFS_EXTENT_ALLOC_ZONED:
3977                /* Nothing to do */
3978                break;
3979        default:
3980                BUG();
3981        }
3982}
3983
3984static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
3985                               struct find_free_extent_ctl *ffe_ctl)
3986{
3987        switch (ffe_ctl->policy) {
3988        case BTRFS_EXTENT_ALLOC_CLUSTERED:
3989                return true;
3990        case BTRFS_EXTENT_ALLOC_ZONED:
3991                /*
3992                 * If we have enough free space left in an already
3993                 * active block group and we can't activate any other
3994                 * zone now, do not allow allocating a new chunk and
3995                 * let find_free_extent() retry with a smaller size.
3996                 */
3997                if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
3998                    !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
3999                        return false;
4000                return true;
4001        default:
4002                BUG();
4003        }
4004}
4005
4006static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
4007{
4008        switch (ffe_ctl->policy) {
4009        case BTRFS_EXTENT_ALLOC_CLUSTERED:
4010                /*
4011                 * If we can't allocate a new chunk we've already looped through
4012                 * at least once, move on to the NO_EMPTY_SIZE case.
4013                 */
4014                ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
4015                return 0;
4016        case BTRFS_EXTENT_ALLOC_ZONED:
4017                /* Give up here */
4018                return -ENOSPC;
4019        default:
4020                BUG();
4021        }
4022}
4023
4024/*
4025 * Return >0 means caller needs to re-search for free extent
4026 * Return 0 means we have the needed free extent.
4027 * Return <0 means we failed to locate any free extent.
4028 */
4029static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4030                                        struct btrfs_key *ins,
4031                                        struct find_free_extent_ctl *ffe_ctl,
4032                                        bool full_search)
4033{
4034        struct btrfs_root *root = fs_info->chunk_root;
4035        int ret;
4036
4037        if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4038            ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4039                ffe_ctl->orig_have_caching_bg = true;
4040
4041        if (ins->objectid) {
4042                found_extent(ffe_ctl, ins);
4043                return 0;
4044        }
4045
4046        if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4047                return 1;
4048
4049        ffe_ctl->index++;
4050        if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4051                return 1;
4052
4053        /*
4054         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4055         *                      caching kthreads as we move along
4056         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4057         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4058         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4059         *                     again
4060         */
4061        if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4062                ffe_ctl->index = 0;
4063                if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
4064                        /*
4065                         * We want to skip the LOOP_CACHING_WAIT step if we
4066                         * don't have any uncached bgs and we've already done a
4067                         * full search through.
4068                         */
4069                        if (ffe_ctl->orig_have_caching_bg || !full_search)
4070                                ffe_ctl->loop = LOOP_CACHING_WAIT;
4071                        else
4072                                ffe_ctl->loop = LOOP_ALLOC_CHUNK;
4073                } else {
4074                        ffe_ctl->loop++;
4075                }
4076
4077                if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4078                        struct btrfs_trans_handle *trans;
4079                        int exist = 0;
4080
4081                        /* Check if the allocation policy allows creating a new chunk */
4082                        if (!can_allocate_chunk(fs_info, ffe_ctl))
4083                                return -ENOSPC;
4084
4085                        trans = current->journal_info;
4086                        if (trans)
4087                                exist = 1;
4088                        else
4089                                trans = btrfs_join_transaction(root);
4090
4091                        if (IS_ERR(trans)) {
4092                                ret = PTR_ERR(trans);
4093                                return ret;
4094                        }
4095
4096                        ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4097                                                CHUNK_ALLOC_FORCE_FOR_EXTENT);
4098
4099                        /* Do not bail out on ENOSPC since we can do more. */
4100                        if (ret == -ENOSPC)
4101                                ret = chunk_allocation_failed(ffe_ctl);
4102                        else if (ret < 0)
4103                                btrfs_abort_transaction(trans, ret);
4104                        else
4105                                ret = 0;
4106                        if (!exist)
4107                                btrfs_end_transaction(trans);
4108                        if (ret)
4109                                return ret;
4110                }
4111
4112                if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4113                        if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4114                                return -ENOSPC;
4115
4116                        /*
4117                         * Don't loop again if we already have no empty_size and
4118                         * no empty_cluster.
4119                         */
4120                        if (ffe_ctl->empty_size == 0 &&
4121                            ffe_ctl->empty_cluster == 0)
4122                                return -ENOSPC;
4123                        ffe_ctl->empty_size = 0;
4124                        ffe_ctl->empty_cluster = 0;
4125                }
4126                return 1;
4127        }
4128        return -ENOSPC;
4129}
4130
4131static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4132                                        struct find_free_extent_ctl *ffe_ctl,
4133                                        struct btrfs_space_info *space_info,
4134                                        struct btrfs_key *ins)
4135{
4136        /*
4137         * If our free space is heavily fragmented we may not be able to make
4138         * big contiguous allocations, so instead of doing the expensive search
4139         * for free space, simply return ENOSPC with our max_extent_size so we
4140         * can go ahead and search for a more manageable chunk.
4141         *
4142         * If our max_extent_size is large enough for our allocation simply
4143         * disable clustering since we will likely not be able to find enough
4144         * space to create a cluster and induce latency trying.
4145         */
4146        if (space_info->max_extent_size) {
4147                spin_lock(&space_info->lock);
4148                if (space_info->max_extent_size &&
4149                    ffe_ctl->num_bytes > space_info->max_extent_size) {
4150                        ins->offset = space_info->max_extent_size;
4151                        spin_unlock(&space_info->lock);
4152                        return -ENOSPC;
4153                } else if (space_info->max_extent_size) {
4154                        ffe_ctl->use_cluster = false;
4155                }
4156                spin_unlock(&space_info->lock);
4157        }
4158
4159        ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4160                                               &ffe_ctl->empty_cluster);
4161        if (ffe_ctl->last_ptr) {
4162                struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4163
4164                spin_lock(&last_ptr->lock);
4165                if (last_ptr->block_group)
4166                        ffe_ctl->hint_byte = last_ptr->window_start;
4167                if (last_ptr->fragmented) {
4168                        /*
4169                         * We still set window_start so we can keep track of the
4170                         * last place we found an allocation to try and save
4171                         * some time.
4172                         */
4173                        ffe_ctl->hint_byte = last_ptr->window_start;
4174                        ffe_ctl->use_cluster = false;
4175                }
4176                spin_unlock(&last_ptr->lock);
4177        }
4178
4179        return 0;
4180}
4181
4182static int prepare_allocation(struct btrfs_fs_info *fs_info,
4183                              struct find_free_extent_ctl *ffe_ctl,
4184                              struct btrfs_space_info *space_info,
4185                              struct btrfs_key *ins)
4186{
4187        switch (ffe_ctl->policy) {
4188        case BTRFS_EXTENT_ALLOC_CLUSTERED:
4189                return prepare_allocation_clustered(fs_info, ffe_ctl,
4190                                                    space_info, ins);
4191        case BTRFS_EXTENT_ALLOC_ZONED:
4192                if (ffe_ctl->for_treelog) {
4193                        spin_lock(&fs_info->treelog_bg_lock);
4194                        if (fs_info->treelog_bg)
4195                                ffe_ctl->hint_byte = fs_info->treelog_bg;
4196                        spin_unlock(&fs_info->treelog_bg_lock);
4197                }
4198                if (ffe_ctl->for_data_reloc) {
4199                        spin_lock(&fs_info->relocation_bg_lock);
4200                        if (fs_info->data_reloc_bg)
4201                                ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4202                        spin_unlock(&fs_info->relocation_bg_lock);
4203                }
4204                return 0;
4205        default:
4206                BUG();
4207        }
4208}
4209
4210/*
4211 * walks the btree of allocated extents and finds a hole of a given size.
4212 * The key ins is changed to record the hole:
4213 * ins->objectid == start position
4214 * ins->type == BTRFS_EXTENT_ITEM_KEY
4215 * ins->offset == the size of the hole.
4216 * Any available blocks before search_start are skipped.
4217 *
4218 * If there is no suitable free space, we will record the max size of
4219 * the free space extent currently available.
4220 *
4221 * The overall logic and call chain:
4222 *
4223 * find_free_extent()
4224 * |- Iterate through all block groups
4225 * |  |- Get a valid block group
4226 * |  |- Try to do clustered allocation in that block group
4227 * |  |- Try to do unclustered allocation in that block group
4228 * |  |- Check if the result is valid
4229 * |  |  |- If valid, then exit
4230 * |  |- Jump to next block group
4231 * |
4232 * |- Push harder to find free extents
4233 *    |- If not found, re-iterate all block groups
4234 */
4235static noinline int find_free_extent(struct btrfs_root *root,
4236                                     struct btrfs_key *ins,
4237                                     struct find_free_extent_ctl *ffe_ctl)
4238{
4239        struct btrfs_fs_info *fs_info = root->fs_info;
4240        int ret = 0;
4241        int cache_block_group_error = 0;
4242        struct btrfs_block_group *block_group = NULL;
4243        struct btrfs_space_info *space_info;
4244        bool full_search = false;
4245
4246        WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4247
4248        ffe_ctl->search_start = 0;
4249        /* For clustered allocation */
4250        ffe_ctl->empty_cluster = 0;
4251        ffe_ctl->last_ptr = NULL;
4252        ffe_ctl->use_cluster = true;
4253        ffe_ctl->have_caching_bg = false;
4254        ffe_ctl->orig_have_caching_bg = false;
4255        ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4256        ffe_ctl->loop = 0;
4257        /* For clustered allocation */
4258        ffe_ctl->retry_clustered = false;
4259        ffe_ctl->retry_unclustered = false;
4260        ffe_ctl->cached = 0;
4261        ffe_ctl->max_extent_size = 0;
4262        ffe_ctl->total_free_space = 0;
4263        ffe_ctl->found_offset = 0;
4264        ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4265
4266        if (btrfs_is_zoned(fs_info))
4267                ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4268
4269        ins->type = BTRFS_EXTENT_ITEM_KEY;
4270        ins->objectid = 0;
4271        ins->offset = 0;
4272
4273        trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
4274                               ffe_ctl->flags);
4275
4276        space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4277        if (!space_info) {
4278                btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4279                return -ENOSPC;
4280        }
4281
4282        ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4283        if (ret < 0)
4284                return ret;
4285
4286        ffe_ctl->search_start = max(ffe_ctl->search_start,
4287                                    first_logical_byte(fs_info));
4288        ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4289        if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4290                block_group = btrfs_lookup_block_group(fs_info,
4291                                                       ffe_ctl->search_start);
4292                /*
4293                 * we don't want to use the block group if it doesn't match our
4294                 * allocation bits, or if it's not cached.
4295                 *
4296                 * However if we are re-searching with an ideal block group
4297                 * picked out then we don't care that the block group is cached.
4298                 */
4299                if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4300                    block_group->cached != BTRFS_CACHE_NO) {
4301                        down_read(&space_info->groups_sem);
4302                        if (list_empty(&block_group->list) ||
4303                            block_group->ro) {
4304                                /*
4305                                 * someone is removing this block group,
4306                                 * we can't jump into the have_block_group
4307                                 * target because our list pointers are not
4308                                 * valid
4309                                 */
4310                                btrfs_put_block_group(block_group);
4311                                up_read(&space_info->groups_sem);
4312                        } else {
4313                                ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4314                                                        block_group->flags);
4315                                btrfs_lock_block_group(block_group,
4316                                                       ffe_ctl->delalloc);
4317                                goto have_block_group;
4318                        }
4319                } else if (block_group) {
4320                        btrfs_put_block_group(block_group);
4321                }
4322        }
4323search:
4324        ffe_ctl->have_caching_bg = false;
4325        if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4326            ffe_ctl->index == 0)
4327                full_search = true;
4328        down_read(&space_info->groups_sem);
4329        list_for_each_entry(block_group,
4330                            &space_info->block_groups[ffe_ctl->index], list) {
4331                struct btrfs_block_group *bg_ret;
4332
4333                /* If the block group is read-only, we can skip it entirely. */
4334                if (unlikely(block_group->ro)) {
4335                        if (ffe_ctl->for_treelog)
4336                                btrfs_clear_treelog_bg(block_group);
4337                        if (ffe_ctl->for_data_reloc)
4338                                btrfs_clear_data_reloc_bg(block_group);
4339                        continue;
4340                }
4341
4342                btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4343                ffe_ctl->search_start = block_group->start;
4344
4345                /*
4346                 * this can happen if we end up cycling through all the
4347                 * raid types, but we want to make sure we only allocate
4348                 * for the proper type.
4349                 */
4350                if (!block_group_bits(block_group, ffe_ctl->flags)) {
4351                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
4352                                BTRFS_BLOCK_GROUP_RAID1_MASK |
4353                                BTRFS_BLOCK_GROUP_RAID56_MASK |
4354                                BTRFS_BLOCK_GROUP_RAID10;
4355
4356                        /*
4357                         * if they asked for extra copies and this block group
4358                         * doesn't provide them, bail.  This does allow us to
4359                         * fill raid0 from raid1.
4360                         */
4361                        if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4362                                goto loop;
4363
4364                        /*
4365                         * This block group has different flags than we want.
4366                         * It's possible that we have MIXED_GROUP flag but no
4367                         * block group is mixed.  Just skip such block group.
4368                         */
4369                        btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4370                        continue;
4371                }
4372
4373have_block_group:
4374                ffe_ctl->cached = btrfs_block_group_done(block_group);
4375                if (unlikely(!ffe_ctl->cached)) {
4376                        ffe_ctl->have_caching_bg = true;
4377                        ret = btrfs_cache_block_group(block_group, 0);
4378
4379                        /*
4380                         * If we get ENOMEM here or something else we want to
4381                         * try other block groups, because it may not be fatal.
4382                         * However if we can't find anything else we need to
4383                         * save our return here so that we return the actual
4384                         * error that caused problems, not ENOSPC.
4385                         */
4386                        if (ret < 0) {
4387                                if (!cache_block_group_error)
4388                                        cache_block_group_error = ret;
4389                                ret = 0;
4390                                goto loop;
4391                        }
4392                        ret = 0;
4393                }
4394
4395                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
4396                        goto loop;
4397
4398                bg_ret = NULL;
4399                ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4400                if (ret == 0) {
4401                        if (bg_ret && bg_ret != block_group) {
4402                                btrfs_release_block_group(block_group,
4403                                                          ffe_ctl->delalloc);
4404                                block_group = bg_ret;
4405                        }
4406                } else if (ret == -EAGAIN) {
4407                        goto have_block_group;
4408                } else if (ret > 0) {
4409                        goto loop;
4410                }
4411
4412                /* Checks */
4413                ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4414                                                 fs_info->stripesize);
4415
4416                /* move on to the next group */
4417                if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4418                    block_group->start + block_group->length) {
4419                        btrfs_add_free_space_unused(block_group,
4420                                            ffe_ctl->found_offset,
4421                                            ffe_ctl->num_bytes);
4422                        goto loop;
4423                }
4424
4425                if (ffe_ctl->found_offset < ffe_ctl->search_start)
4426                        btrfs_add_free_space_unused(block_group,
4427                                        ffe_ctl->found_offset,
4428                                        ffe_ctl->search_start - ffe_ctl->found_offset);
4429
4430                ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4431                                               ffe_ctl->num_bytes,
4432                                               ffe_ctl->delalloc);
4433                if (ret == -EAGAIN) {
4434                        btrfs_add_free_space_unused(block_group,
4435                                        ffe_ctl->found_offset,
4436                                        ffe_ctl->num_bytes);
4437                        goto loop;
4438                }
4439                btrfs_inc_block_group_reservations(block_group);
4440
4441                /* we are all good, let's return */
4442                ins->objectid = ffe_ctl->search_start;
4443                ins->offset = ffe_ctl->num_bytes;
4444
4445                trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
4446                                           ffe_ctl->num_bytes);
4447                btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4448                break;
4449loop:
4450                release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4451                cond_resched();
4452        }
4453        up_read(&space_info->groups_sem);
4454
4455        ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4456        if (ret > 0)
4457                goto search;
4458
4459        if (ret == -ENOSPC && !cache_block_group_error) {
4460                /*
4461                 * Use ffe_ctl->total_free_space as fallback if we can't find
4462                 * any contiguous hole.
4463                 */
4464                if (!ffe_ctl->max_extent_size)
4465                        ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4466                spin_lock(&space_info->lock);
4467                space_info->max_extent_size = ffe_ctl->max_extent_size;
4468                spin_unlock(&space_info->lock);
4469                ins->offset = ffe_ctl->max_extent_size;
4470        } else if (ret == -ENOSPC) {
4471                ret = cache_block_group_error;
4472        }
4473        return ret;
4474}
4475
4476/*
4477 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4478 *                        hole that is at least as big as @num_bytes.
4479 *
4480 * @root           -    The root that will contain this extent
4481 *
4482 * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
4483 *                      is used for accounting purposes. This value differs
4484 *                      from @num_bytes only in the case of compressed extents.
4485 *
4486 * @num_bytes      -    Number of bytes to allocate on-disk.
4487 *
4488 * @min_alloc_size -    Indicates the minimum amount of space that the
4489 *                      allocator should try to satisfy. In some cases
4490 *                      @num_bytes may be larger than what is required and if
4491 *                      the filesystem is fragmented then allocation fails.
4492 *                      However, the presence of @min_alloc_size gives a
4493 *                      chance to try and satisfy the smaller allocation.
4494 *
4495 * @empty_size     -    A hint that you plan on doing more COW. This is the
4496 *                      size in bytes the allocator should try to find free
4497 *                      next to the block it returns.  This is just a hint and
4498 *                      may be ignored by the allocator.
4499 *
4500 * @hint_byte      -    Hint to the allocator to start searching above the byte
4501 *                      address passed. It might be ignored.
4502 *
4503 * @ins            -    This key is modified to record the found hole. It will
4504 *                      have the following values:
4505 *                      ins->objectid == start position
4506 *                      ins->type == BTRFS_EXTENT_ITEM_KEY
4507 *                      ins->offset == the size of the hole.
4508 *
4509 * @is_data        -    Boolean flag indicating whether an extent is
4510 *                      allocated for data (true) or metadata (false)
4511 *
4512 * @delalloc       -    Boolean flag indicating whether this allocation is for
4513 *                      delalloc or not. If 'true' data_rwsem of block groups
4514 *                      is going to be acquired.
4515 *
4516 *
4517 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4518 * case -ENOSPC is returned then @ins->offset will contain the size of the
4519 * largest available hole the allocator managed to find.
4520 */
4521int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4522                         u64 num_bytes, u64 min_alloc_size,
4523                         u64 empty_size, u64 hint_byte,
4524                         struct btrfs_key *ins, int is_data, int delalloc)
4525{
4526        struct btrfs_fs_info *fs_info = root->fs_info;
4527        struct find_free_extent_ctl ffe_ctl = {};
4528        bool final_tried = num_bytes == min_alloc_size;
4529        u64 flags;
4530        int ret;
4531        bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4532        bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4533
4534        flags = get_alloc_profile_by_root(root, is_data);
4535again:
4536        WARN_ON(num_bytes < fs_info->sectorsize);
4537
4538        ffe_ctl.ram_bytes = ram_bytes;
4539        ffe_ctl.num_bytes = num_bytes;
4540        ffe_ctl.min_alloc_size = min_alloc_size;
4541        ffe_ctl.empty_size = empty_size;
4542        ffe_ctl.flags = flags;
4543        ffe_ctl.delalloc = delalloc;
4544        ffe_ctl.hint_byte = hint_byte;
4545        ffe_ctl.for_treelog = for_treelog;
4546        ffe_ctl.for_data_reloc = for_data_reloc;
4547
4548        ret = find_free_extent(root, ins, &ffe_ctl);
4549        if (!ret && !is_data) {
4550                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4551        } else if (ret == -ENOSPC) {
4552                if (!final_tried && ins->offset) {
4553                        num_bytes = min(num_bytes >> 1, ins->offset);
4554                        num_bytes = round_down(num_bytes,
4555                                               fs_info->sectorsize);
4556                        num_bytes = max(num_bytes, min_alloc_size);
4557                        ram_bytes = num_bytes;
4558                        if (num_bytes == min_alloc_size)
4559                                final_tried = true;
4560                        goto again;
4561                } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4562                        struct btrfs_space_info *sinfo;
4563
4564                        sinfo = btrfs_find_space_info(fs_info, flags);
4565                        btrfs_err(fs_info,
4566        "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4567                                  flags, num_bytes, for_treelog, for_data_reloc);
4568                        if (sinfo)
4569                                btrfs_dump_space_info(fs_info, sinfo,
4570                                                      num_bytes, 1);
4571                }
4572        }
4573
4574        return ret;
4575}
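
/*
 * Illustrative sketch, not part of the kernel sources: roughly how a data
 * allocation path might call btrfs_reserve_extent() and deal with -ENOSPC.
 * The helper name is hypothetical.  As a worked example of the -ENOSPC
 * retry above (assuming a 4 KiB sectorsize): with num_bytes = 1 MiB,
 * min_alloc_size = 64 KiB and a largest reported hole of
 * ins->offset = 300 KiB, the next attempt asks for
 * round_down(min(512 KiB, 300 KiB), sectorsize) = 300 KiB.
 */
static __maybe_unused int example_reserve_data_extent(struct btrfs_root *root,
                                                      u64 num_bytes,
                                                      u64 min_alloc_size,
                                                      struct btrfs_key *ins)
{
        int ret;

        /*
         * Uncompressed data, so ram_bytes == num_bytes; no empty_size or
         * hint_byte, is_data = 1 and delalloc = 0.
         */
        ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size,
                                   0, 0, ins, 1, 0);
        if (ret == -ENOSPC) {
                /*
                 * ins->offset now holds the largest hole the allocator could
                 * find; the caller may retry with a request no bigger than
                 * that, or flush space and try again.
                 */
                return ret;
        }
        if (ret)
                return ret;

        /*
         * Success: [ins->objectid, ins->objectid + ins->offset) is reserved.
         * A caller that bails out before using the range must give the
         * reservation back with btrfs_free_reserved_extent().
         */
        return 0;
}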
4576
4577int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4578                               u64 start, u64 len, int delalloc)
4579{
4580        struct btrfs_block_group *cache;
4581
4582        cache = btrfs_lookup_block_group(fs_info, start);
4583        if (!cache) {
4584                btrfs_err(fs_info, "Unable to find block group for %llu",
4585                          start);
4586                return -ENOSPC;
4587        }
4588
4589        btrfs_add_free_space(cache, start, len);
4590        btrfs_free_reserved_bytes(cache, len, delalloc);
4591        trace_btrfs_reserved_extent_free(fs_info, start, len);
4592
4593        btrfs_put_block_group(cache);
4594        return 0;
4595}
4596
4597int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4598                              u64 len)
4599{
4600        struct btrfs_block_group *cache;
4601        int ret = 0;
4602
4603        cache = btrfs_lookup_block_group(trans->fs_info, start);
4604        if (!cache) {
4605                btrfs_err(trans->fs_info, "unable to find block group for %llu",
4606                          start);
4607                return -ENOSPC;
4608        }
4609
4610        ret = pin_down_extent(trans, cache, start, len, 1);
4611        btrfs_put_block_group(cache);
4612        return ret;
4613}
4614
4615static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4616                                 u64 num_bytes)
4617{
4618        struct btrfs_fs_info *fs_info = trans->fs_info;
4619        int ret;
4620
4621        ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4622        if (ret)
4623                return ret;
4624
4625        ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4626        if (ret) {
4627                ASSERT(!ret);
4628                btrfs_err(fs_info, "update block group failed for %llu %llu",
4629                          bytenr, num_bytes);
4630                return ret;
4631        }
4632
4633        trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4634        return 0;
4635}
4636
4637static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4638                                      u64 parent, u64 root_objectid,
4639                                      u64 flags, u64 owner, u64 offset,
4640                                      struct btrfs_key *ins, int ref_mod)
4641{
4642        struct btrfs_fs_info *fs_info = trans->fs_info;
4643        struct btrfs_root *extent_root;
4644        int ret;
4645        struct btrfs_extent_item *extent_item;
4646        struct btrfs_extent_inline_ref *iref;
4647        struct btrfs_path *path;
4648        struct extent_buffer *leaf;
4649        int type;
4650        u32 size;
4651
4652        if (parent > 0)
4653                type = BTRFS_SHARED_DATA_REF_KEY;
4654        else
4655                type = BTRFS_EXTENT_DATA_REF_KEY;
4656
4657        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4658
4659        path = btrfs_alloc_path();
4660        if (!path)
4661                return -ENOMEM;
4662
4663        extent_root = btrfs_extent_root(fs_info, ins->objectid);
4664        ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4665        if (ret) {
4666                btrfs_free_path(path);
4667                return ret;
4668        }
4669
4670        leaf = path->nodes[0];
4671        extent_item = btrfs_item_ptr(leaf, path->slots[0],
4672                                     struct btrfs_extent_item);
4673        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4674        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4675        btrfs_set_extent_flags(leaf, extent_item,
4676                               flags | BTRFS_EXTENT_FLAG_DATA);
4677
4678        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4679        btrfs_set_extent_inline_ref_type(leaf, iref, type);
4680        if (parent > 0) {
4681                struct btrfs_shared_data_ref *ref;
4682                ref = (struct btrfs_shared_data_ref *)(iref + 1);
4683                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4684                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4685        } else {
4686                struct btrfs_extent_data_ref *ref;
4687                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4688                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4689                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4690                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4691                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4692        }
4693
4694        btrfs_mark_buffer_dirty(path->nodes[0]);
4695        btrfs_free_path(path);
4696
4697        return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4698}
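
/*
 * The item written by alloc_reserved_file_extent() above is keyed
 * (ins->objectid, BTRFS_EXTENT_ITEM_KEY, ins->offset) and laid out as:
 *
 *      btrfs_extent_item        refs | generation | flags (FLAG_DATA set)
 *      btrfs_extent_inline_ref  type | offset
 *
 * For a shared ref (parent != 0) the inline ref offset is the parent
 * bytenr and a btrfs_shared_data_ref { count } follows it; otherwise a
 * btrfs_extent_data_ref { root, objectid, offset, count } starts at the
 * inline ref's offset field, as the cast above shows.
 */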
4699
4700static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4701                                     struct btrfs_delayed_ref_node *node,
4702                                     struct btrfs_delayed_extent_op *extent_op)
4703{
4704        struct btrfs_fs_info *fs_info = trans->fs_info;
4705        struct btrfs_root *extent_root;
4706        int ret;
4707        struct btrfs_extent_item *extent_item;
4708        struct btrfs_key extent_key;
4709        struct btrfs_tree_block_info *block_info;
4710        struct btrfs_extent_inline_ref *iref;
4711        struct btrfs_path *path;
4712        struct extent_buffer *leaf;
4713        struct btrfs_delayed_tree_ref *ref;
4714        u32 size = sizeof(*extent_item) + sizeof(*iref);
4715        u64 flags = extent_op->flags_to_set;
4716        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4717
4718        ref = btrfs_delayed_node_to_tree_ref(node);
4719
4720        extent_key.objectid = node->bytenr;
4721        if (skinny_metadata) {
4722                extent_key.offset = ref->level;
4723                extent_key.type = BTRFS_METADATA_ITEM_KEY;
4724        } else {
4725                extent_key.offset = node->num_bytes;
4726                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4727                size += sizeof(*block_info);
4728        }
4729
4730        path = btrfs_alloc_path();
4731        if (!path)
4732                return -ENOMEM;
4733
4734        extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4735        ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4736                                      size);
4737        if (ret) {
4738                btrfs_free_path(path);
4739                return ret;
4740        }
4741
4742        leaf = path->nodes[0];
4743        extent_item = btrfs_item_ptr(leaf, path->slots[0],
4744                                     struct btrfs_extent_item);
4745        btrfs_set_extent_refs(leaf, extent_item, 1);
4746        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4747        btrfs_set_extent_flags(leaf, extent_item,
4748                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4749
4750        if (skinny_metadata) {
4751                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4752        } else {
4753                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4754                btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4755                btrfs_set_tree_block_level(leaf, block_info, ref->level);
4756                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4757        }
4758
4759        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4760                btrfs_set_extent_inline_ref_type(leaf, iref,
4761                                                 BTRFS_SHARED_BLOCK_REF_KEY);
4762                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4763        } else {
4764                btrfs_set_extent_inline_ref_type(leaf, iref,
4765                                                 BTRFS_TREE_BLOCK_REF_KEY);
4766                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4767        }
4768
4769        btrfs_mark_buffer_dirty(leaf);
4770        btrfs_free_path(path);
4771
4772        return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4773}
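
/*
 * alloc_reserved_tree_block() above writes one of two forms depending on
 * the SKINNY_METADATA incompat flag:
 *
 *      skinny:     key (bytenr, BTRFS_METADATA_ITEM_KEY, level)
 *                  btrfs_extent_item + inline ref
 *      non-skinny: key (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)
 *                  btrfs_extent_item + btrfs_tree_block_info + inline ref
 *
 * The inline ref mirrors the delayed ref: a shared block ref stores the
 * parent bytenr in its offset, a tree block ref stores the owning root's
 * objectid.
 */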
4774
4775int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4776                                     struct btrfs_root *root, u64 owner,
4777                                     u64 offset, u64 ram_bytes,
4778                                     struct btrfs_key *ins)
4779{
4780        struct btrfs_ref generic_ref = { 0 };
4781
4782        BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4783
4784        btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4785                               ins->objectid, ins->offset, 0);
4786        btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4787                            offset, 0, false);
4788        btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4789
4790        return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4791}
4792
4793/*
4794 * this is used by the tree logging recovery code.  It records that
4795 * an extent has been allocated and makes sure to clear the free
4796 * space cache bits as well
4797 */
4798int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4799                                   u64 root_objectid, u64 owner, u64 offset,
4800                                   struct btrfs_key *ins)
4801{
4802        struct btrfs_fs_info *fs_info = trans->fs_info;
4803        int ret;
4804        struct btrfs_block_group *block_group;
4805        struct btrfs_space_info *space_info;
4806
4807        /*
4808         * Mixed block groups will exclude before processing the log so we only
4809         * need to do the exclude dance if this fs isn't mixed.
4810         */
4811        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4812                ret = __exclude_logged_extent(fs_info, ins->objectid,
4813                                              ins->offset);
4814                if (ret)
4815                        return ret;
4816        }
4817
4818        block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4819        if (!block_group)
4820                return -EINVAL;
4821
4822        space_info = block_group->space_info;
4823        spin_lock(&space_info->lock);
4824        spin_lock(&block_group->lock);
4825        space_info->bytes_reserved += ins->offset;
4826        block_group->reserved += ins->offset;
4827        spin_unlock(&block_group->lock);
4828        spin_unlock(&space_info->lock);
4829
4830        ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4831                                         offset, ins, 1);
4832        if (ret)
4833                btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4834        btrfs_put_block_group(block_group);
4835        return ret;
4836}
4837
4838static struct extent_buffer *
4839btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4840                      u64 bytenr, int level, u64 owner,
4841                      enum btrfs_lock_nesting nest)
4842{
4843        struct btrfs_fs_info *fs_info = root->fs_info;
4844        struct extent_buffer *buf;
4845
4846        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4847        if (IS_ERR(buf))
4848                return buf;
4849
4850        /*
4851         * Extra safety check in case the extent tree is corrupted and extent
4852         * allocator chooses to use a tree block which is already used and
4853         * locked.
4854         */
4855        if (buf->lock_owner == current->pid) {
4856                btrfs_err_rl(fs_info,
4857"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4858                        buf->start, btrfs_header_owner(buf), current->pid);
4859                free_extent_buffer(buf);
4860                return ERR_PTR(-EUCLEAN);
4861        }
4862
4863        /*
4864         * This needs to stay, because we could allocate a freed block from an
4865         * old tree into a new tree, so we need to make sure this new block is
4866         * set to the appropriate level and owner.
4867         */
4868        btrfs_set_buffer_lockdep_class(owner, buf, level);
4869        __btrfs_tree_lock(buf, nest);
4870        btrfs_clean_tree_block(buf);
4871        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4872        clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4873
4874        set_extent_buffer_uptodate(buf);
4875
4876        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4877        btrfs_set_header_level(buf, level);
4878        btrfs_set_header_bytenr(buf, buf->start);
4879        btrfs_set_header_generation(buf, trans->transid);
4880        btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4881        btrfs_set_header_owner(buf, owner);
4882        write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4883        write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4884        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4885                buf->log_index = root->log_transid % 2;
4886                /*
4887                 * we allow two log transactions at a time; use different
4888                 * EXTENT bits to differentiate dirty pages.
4889                 */
4890                if (buf->log_index == 0)
4891                        set_extent_dirty(&root->dirty_log_pages, buf->start,
4892                                        buf->start + buf->len - 1, GFP_NOFS);
4893                else
4894                        set_extent_new(&root->dirty_log_pages, buf->start,
4895                                        buf->start + buf->len - 1);
4896        } else {
4897                buf->log_index = -1;
4898                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4899                         buf->start + buf->len - 1, GFP_NOFS);
4900        }
4901        /* this returns a buffer locked for blocking */
4902        return buf;
4903}
4904
4905/*
4906 * finds a free extent and does all the dirty work required for allocation.
4907 * Returns the tree buffer or an ERR_PTR on error.
4908 */
4909struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4910                                             struct btrfs_root *root,
4911                                             u64 parent, u64 root_objectid,
4912                                             const struct btrfs_disk_key *key,
4913                                             int level, u64 hint,
4914                                             u64 empty_size,
4915                                             enum btrfs_lock_nesting nest)
4916{
4917        struct btrfs_fs_info *fs_info = root->fs_info;
4918        struct btrfs_key ins;
4919        struct btrfs_block_rsv *block_rsv;
4920        struct extent_buffer *buf;
4921        struct btrfs_delayed_extent_op *extent_op;
4922        struct btrfs_ref generic_ref = { 0 };
4923        u64 flags = 0;
4924        int ret;
4925        u32 blocksize = fs_info->nodesize;
4926        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4927
4928#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4929        if (btrfs_is_testing(fs_info)) {
4930                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4931                                            level, root_objectid, nest);
4932                if (!IS_ERR(buf))
4933                        root->alloc_bytenr += blocksize;
4934                return buf;
4935        }
4936#endif
4937
4938        block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4939        if (IS_ERR(block_rsv))
4940                return ERR_CAST(block_rsv);
4941
4942        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4943                                   empty_size, hint, &ins, 0, 0);
4944        if (ret)
4945                goto out_unuse;
4946
4947        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4948                                    root_objectid, nest);
4949        if (IS_ERR(buf)) {
4950                ret = PTR_ERR(buf);
4951                goto out_free_reserved;
4952        }
4953
4954        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4955                if (parent == 0)
4956                        parent = ins.objectid;
4957                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4958        } else
4959                BUG_ON(parent > 0);
4960
4961        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4962                extent_op = btrfs_alloc_delayed_extent_op();
4963                if (!extent_op) {
4964                        ret = -ENOMEM;
4965                        goto out_free_buf;
4966                }
4967                if (key)
4968                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
4969                else
4970                        memset(&extent_op->key, 0, sizeof(extent_op->key));
4971                extent_op->flags_to_set = flags;
4972                extent_op->update_key = skinny_metadata ? false : true;
4973                extent_op->update_flags = true;
4974                extent_op->level = level;
4975
4976                btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4977                                       ins.objectid, ins.offset, parent);
4978                btrfs_init_tree_ref(&generic_ref, level, root_objectid,
4979                                    root->root_key.objectid, false);
4980                btrfs_ref_tree_mod(fs_info, &generic_ref);
4981                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4982                if (ret)
4983                        goto out_free_delayed;
4984        }
4985        return buf;
4986
4987out_free_delayed:
4988        btrfs_free_delayed_extent_op(extent_op);
4989out_free_buf:
4990        btrfs_tree_unlock(buf);
4991        free_extent_buffer(buf);
4992out_free_reserved:
4993        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4994out_unuse:
4995        btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4996        return ERR_PTR(ret);
4997}
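
/*
 * Illustrative sketch, not part of the kernel sources: the shape of a
 * typical btrfs_alloc_tree_block() caller.  The helper name is
 * hypothetical; real callers such as the COW path also copy items into
 * the new block and wire it into its parent before unlocking.
 */
static __maybe_unused struct extent_buffer *
example_alloc_empty_node(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, int level, u64 hint)
{
        struct extent_buffer *buf;

        buf = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
                                     NULL, level, hint, 0,
                                     BTRFS_NESTING_NORMAL);
        if (IS_ERR(buf))
                return buf;

        /*
         * The buffer comes back write-locked with an initialized header and,
         * for non-log trees, a delayed ref already queued.  The caller fills
         * it in, marks it dirty with btrfs_mark_buffer_dirty() and unlocks
         * it with btrfs_tree_unlock() when done.
         */
        return buf;
}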
4998
4999struct walk_control {
5000        u64 refs[BTRFS_MAX_LEVEL];
5001        u64 flags[BTRFS_MAX_LEVEL];
5002        struct btrfs_key update_progress;
5003        struct btrfs_key drop_progress;
5004        int drop_level;
5005        int stage;
5006        int level;
5007        int shared_level;
5008        int update_ref;
5009        int keep_locks;
5010        int reada_slot;
5011        int reada_count;
5012        int restarted;
5013};
5014
5015#define DROP_REFERENCE  1
5016#define UPDATE_BACKREF  2
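
/*
 * wc->stage drives the walk: it starts in DROP_REFERENCE, dropping
 * references while descending.  When do_walk_down() meets a shared block
 * whose back refs still need updating, it switches the stage to
 * UPDATE_BACKREF for the subtree rooted there, and walk_up_proc() switches
 * it back to DROP_REFERENCE once that subtree has been processed.
 */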
5017
5018static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5019                                     struct btrfs_root *root,
5020                                     struct walk_control *wc,
5021                                     struct btrfs_path *path)
5022{
5023        struct btrfs_fs_info *fs_info = root->fs_info;
5024        u64 bytenr;
5025        u64 generation;
5026        u64 refs;
5027        u64 flags;
5028        u32 nritems;
5029        struct btrfs_key key;
5030        struct extent_buffer *eb;
5031        int ret;
5032        int slot;
5033        int nread = 0;
5034
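        /*
         * Scale the readahead batch adaptively: shrink it while the current
         * slot is still below the last readahead position (we are revisiting
         * the same area), grow it once the walk has moved past it, capped at
         * one full node's worth of pointers.
         */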
5035        if (path->slots[wc->level] < wc->reada_slot) {
5036                wc->reada_count = wc->reada_count * 2 / 3;
5037                wc->reada_count = max(wc->reada_count, 2);
5038        } else {
5039                wc->reada_count = wc->reada_count * 3 / 2;
5040                wc->reada_count = min_t(int, wc->reada_count,
5041                                        BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5042        }
5043
5044        eb = path->nodes[wc->level];
5045        nritems = btrfs_header_nritems(eb);
5046
5047        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5048                if (nread >= wc->reada_count)
5049                        break;
5050
5051                cond_resched();
5052                bytenr = btrfs_node_blockptr(eb, slot);
5053                generation = btrfs_node_ptr_generation(eb, slot);
5054
5055                if (slot == path->slots[wc->level])
5056                        goto reada;
5057
5058                if (wc->stage == UPDATE_BACKREF &&
5059                    generation <= root->root_key.offset)
5060                        continue;
5061
5062                /* We don't lock the tree block, it's OK to be racy here */
5063                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5064                                               wc->level - 1, 1, &refs,
5065                                               &flags);
5066                /* We don't care about errors in readahead. */
5067                if (ret < 0)
5068                        continue;
5069                BUG_ON(refs == 0);
5070
5071                if (wc->stage == DROP_REFERENCE) {
5072                        if (refs == 1)
5073                                goto reada;
5074
5075                        if (wc->level == 1 &&
5076                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5077                                continue;
5078                        if (!wc->update_ref ||
5079                            generation <= root->root_key.offset)
5080                                continue;
5081                        btrfs_node_key_to_cpu(eb, &key, slot);
5082                        ret = btrfs_comp_cpu_keys(&key,
5083                                                  &wc->update_progress);
5084                        if (ret < 0)
5085                                continue;
5086                } else {
5087                        if (wc->level == 1 &&
5088                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5089                                continue;
5090                }
5091reada:
5092                btrfs_readahead_node_child(eb, slot);
5093                nread++;
5094        }
5095        wc->reada_slot = slot;
5096}
5097
5098/*
5099 * helper to process tree block while walking down the tree.
5100 *
5101 * when wc->stage == UPDATE_BACKREF, this function updates
5102 * back refs for pointers in the block.
5103 *
5104 * NOTE: return value 1 means we should stop walking down.
5105 */
5106static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5107                                   struct btrfs_root *root,
5108                                   struct btrfs_path *path,
5109                                   struct walk_control *wc, int lookup_info)
5110{
5111        struct btrfs_fs_info *fs_info = root->fs_info;
5112        int level = wc->level;
5113        struct extent_buffer *eb = path->nodes[level];
5114        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5115        int ret;
5116
5117        if (wc->stage == UPDATE_BACKREF &&
5118            btrfs_header_owner(eb) != root->root_key.objectid)
5119                return 1;
5120
5121        /*
5122         * when the reference count of a tree block is 1, it won't increase
5123         * again. Once the full backref flag is set, we never clear it.
5124         */
5125        if (lookup_info &&
5126            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5127             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5128                BUG_ON(!path->locks[level]);
5129                ret = btrfs_lookup_extent_info(trans, fs_info,
5130                                               eb->start, level, 1,
5131                                               &wc->refs[level],
5132                                               &wc->flags[level]);
5133                BUG_ON(ret == -ENOMEM);
5134                if (ret)
5135                        return ret;
5136                BUG_ON(wc->refs[level] == 0);
5137        }
5138
5139        if (wc->stage == DROP_REFERENCE) {
5140                if (wc->refs[level] > 1)
5141                        return 1;
5142
5143                if (path->locks[level] && !wc->keep_locks) {
5144                        btrfs_tree_unlock_rw(eb, path->locks[level]);
5145                        path->locks[level] = 0;
5146                }
5147                return 0;
5148        }
5149
5150        /* wc->stage == UPDATE_BACKREF */
5151        if (!(wc->flags[level] & flag)) {
5152                BUG_ON(!path->locks[level]);
5153                ret = btrfs_inc_ref(trans, root, eb, 1);
5154                BUG_ON(ret); /* -ENOMEM */
5155                ret = btrfs_dec_ref(trans, root, eb, 0);
5156                BUG_ON(ret); /* -ENOMEM */
5157                ret = btrfs_set_disk_extent_flags(trans, eb, flag,
5158                                                  btrfs_header_level(eb));
5159                BUG_ON(ret); /* -ENOMEM */
5160                wc->flags[level] |= flag;
5161        }
5162
5163        /*
5164         * the block is shared by multiple trees, so it's not good to
5165         * keep the tree lock
5166         */
5167        if (path->locks[level] && level > 0) {
5168                btrfs_tree_unlock_rw(eb, path->locks[level]);
5169                path->locks[level] = 0;
5170        }
5171        return 0;
5172}
5173
5174/*
5175 * This is used to verify a ref exists for this root to deal with a bug where we
5176 * would have a drop_progress key that hadn't been updated properly.
5177 */
5178static int check_ref_exists(struct btrfs_trans_handle *trans,
5179                            struct btrfs_root *root, u64 bytenr, u64 parent,
5180                            int level)
5181{
5182        struct btrfs_path *path;
5183        struct btrfs_extent_inline_ref *iref;
5184        int ret;
5185
5186        path = btrfs_alloc_path();
5187        if (!path)
5188                return -ENOMEM;
5189
5190        ret = lookup_extent_backref(trans, path, &iref, bytenr,
5191                                    root->fs_info->nodesize, parent,
5192                                    root->root_key.objectid, level, 0);
5193        btrfs_free_path(path);
5194        if (ret == -ENOENT)
5195                return 0;
5196        if (ret < 0)
5197                return ret;
5198        return 1;
5199}
5200
5201/*
5202 * helper to process tree block pointer.
5203 *
5204 * when wc->stage == DROP_REFERENCE, this function checks the
5205 * reference count of the block pointed to. If the block
5206 * is shared and we need to update back refs for the subtree
5207 * rooted at the block, this function changes wc->stage to
5208 * UPDATE_BACKREF. If the block is shared and there is no
5209 * need to update back refs, this function drops the reference
5210 * to the block.
5211 *
5212 * NOTE: return value 1 means we should stop walking down.
5213 */
5214static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5215                                 struct btrfs_root *root,
5216                                 struct btrfs_path *path,
5217                                 struct walk_control *wc, int *lookup_info)
5218{
5219        struct btrfs_fs_info *fs_info = root->fs_info;
5220        u64 bytenr;
5221        u64 generation;
5222        u64 parent;
5223        struct btrfs_key key;
5224        struct btrfs_key first_key;
5225        struct btrfs_ref ref = { 0 };
5226        struct extent_buffer *next;
5227        int level = wc->level;
5228        int reada = 0;
5229        int ret = 0;
5230        bool need_account = false;
5231
5232        generation = btrfs_node_ptr_generation(path->nodes[level],
5233                                               path->slots[level]);
5234        /*
5235         * if the lower level block was created before the snapshot
5236         * was created, we know there is no need to update back refs
5237         * for the subtree
5238         */
5239        if (wc->stage == UPDATE_BACKREF &&
5240            generation <= root->root_key.offset) {
5241                *lookup_info = 1;
5242                return 1;
5243        }
5244
5245        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5246        btrfs_node_key_to_cpu(path->nodes[level], &first_key,
5247                              path->slots[level]);
5248
5249        next = find_extent_buffer(fs_info, bytenr);
5250        if (!next) {
5251                next = btrfs_find_create_tree_block(fs_info, bytenr,
5252                                root->root_key.objectid, level - 1);
5253                if (IS_ERR(next))
5254                        return PTR_ERR(next);
5255                reada = 1;
5256        }
5257        btrfs_tree_lock(next);
5258
5259        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5260                                       &wc->refs[level - 1],
5261                                       &wc->flags[level - 1]);
5262        if (ret < 0)
5263                goto out_unlock;
5264
5265        if (unlikely(wc->refs[level - 1] == 0)) {
5266                btrfs_err(fs_info, "Missing references.");
5267                ret = -EIO;
5268                goto out_unlock;
5269        }
5270        *lookup_info = 0;
5271
5272        if (wc->stage == DROP_REFERENCE) {
5273                if (wc->refs[level - 1] > 1) {
5274                        need_account = true;
5275                        if (level == 1 &&
5276                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5277                                goto skip;
5278
5279                        if (!wc->update_ref ||
5280                            generation <= root->root_key.offset)
5281                                goto skip;
5282
5283                        btrfs_node_key_to_cpu(path->nodes[level], &key,
5284                                              path->slots[level]);
5285                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5286                        if (ret < 0)
5287                                goto skip;
5288
5289                        wc->stage = UPDATE_BACKREF;
5290                        wc->shared_level = level - 1;
5291                }
5292        } else {
5293                if (level == 1 &&
5294                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5295                        goto skip;
5296        }
5297
5298        if (!btrfs_buffer_uptodate(next, generation, 0)) {
5299                btrfs_tree_unlock(next);
5300                free_extent_buffer(next);
5301                next = NULL;
5302                *lookup_info = 1;
5303        }
5304
5305        if (!next) {
5306                if (reada && level == 1)
5307                        reada_walk_down(trans, root, wc, path);
5308                next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
5309                                       generation, level - 1, &first_key);
5310                if (IS_ERR(next)) {
5311                        return PTR_ERR(next);
5312                } else if (!extent_buffer_uptodate(next)) {
5313                        free_extent_buffer(next);
5314                        return -EIO;
5315                }
5316                btrfs_tree_lock(next);
5317        }
5318
5319        level--;
5320        ASSERT(level == btrfs_header_level(next));
5321        if (level != btrfs_header_level(next)) {
5322                btrfs_err(root->fs_info, "mismatched level");
5323                ret = -EIO;
5324                goto out_unlock;
5325        }
5326        path->nodes[level] = next;
5327        path->slots[level] = 0;
5328        path->locks[level] = BTRFS_WRITE_LOCK;
5329        wc->level = level;
5330        if (wc->level == 1)
5331                wc->reada_slot = 0;
5332        return 0;
5333skip:
5334        wc->refs[level - 1] = 0;
5335        wc->flags[level - 1] = 0;
5336        if (wc->stage == DROP_REFERENCE) {
5337                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5338                        parent = path->nodes[level]->start;
5339                } else {
5340                        ASSERT(root->root_key.objectid ==
5341                               btrfs_header_owner(path->nodes[level]));
5342                        if (root->root_key.objectid !=
5343                            btrfs_header_owner(path->nodes[level])) {
5344                                btrfs_err(root->fs_info,
5345                                                "mismatched block owner");
5346                                ret = -EIO;
5347                                goto out_unlock;
5348                        }
5349                        parent = 0;
5350                }
5351
5352                /*
5353                 * If we had a drop_progress we need to verify the refs are set
5354                 * as expected.  If we find our ref then we know that from here
5355                 * on out everything should be correct, and we can clear the
5356                 * ->restarted flag.
5357                 */
5358                if (wc->restarted) {
5359                        ret = check_ref_exists(trans, root, bytenr, parent,
5360                                               level - 1);
5361                        if (ret < 0)
5362                                goto out_unlock;
5363                        if (ret == 0)
5364                                goto no_delete;
5365                        ret = 0;
5366                        wc->restarted = 0;
5367                }
5368
5369                /*
5370                 * The reloc tree doesn't contribute to qgroup numbers, and we
5371                 * have already accounted them at merge time (replace_path),
5372                 * so we can skip the expensive subtree trace here.
5373                 */
5374                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5375                    need_account) {
5376                        ret = btrfs_qgroup_trace_subtree(trans, next,
5377                                                         generation, level - 1);
5378                        if (ret) {
5379                                btrfs_err_rl(fs_info,
5380                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5381                                             ret);
5382                        }
5383                }
5384
5385                /*
5386                 * We need to update the next key in our walk control so we can
5387                 * update the drop_progress key accordingly.  We don't care if
5388                 * find_next_key doesn't find a key because that means we're at
5389                 * the end and are going to clean up now.
5390                 */
5391                wc->drop_level = level;
5392                find_next_key(path, level, &wc->drop_progress);
5393
5394                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5395                                       fs_info->nodesize, parent);
5396                btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5397                                    0, false);
5398                ret = btrfs_free_extent(trans, &ref);
5399                if (ret)
5400                        goto out_unlock;
5401        }
5402no_delete:
5403        *lookup_info = 1;
5404        ret = 1;
5405
5406out_unlock:
5407        btrfs_tree_unlock(next);
5408        free_extent_buffer(next);
5409
5410        return ret;
5411}
5412
5413/*
5414 * helper to process a tree block while walking up the tree.
5415 *
5416 * when wc->stage == DROP_REFERENCE, this function drops the
5417 * reference count on the block.
5418 *
5419 * when wc->stage == UPDATE_BACKREF, this function changes
5420 * wc->stage back to DROP_REFERENCE if we previously switched
5421 * to UPDATE_BACKREF while processing the block.
5422 *
5423 * NOTE: return value 1 means we should stop walking up.
5424 */
5425static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5426                                 struct btrfs_root *root,
5427                                 struct btrfs_path *path,
5428                                 struct walk_control *wc)
5429{
5430        struct btrfs_fs_info *fs_info = root->fs_info;
5431        int ret;
5432        int level = wc->level;
5433        struct extent_buffer *eb = path->nodes[level];
5434        u64 parent = 0;
5435
5436        if (wc->stage == UPDATE_BACKREF) {
5437                BUG_ON(wc->shared_level < level);
5438                if (level < wc->shared_level)
5439                        goto out;
5440
5441                ret = find_next_key(path, level + 1, &wc->update_progress);
5442                if (ret > 0)
5443                        wc->update_ref = 0;
5444
5445                wc->stage = DROP_REFERENCE;
5446                wc->shared_level = -1;
5447                path->slots[level] = 0;
5448
5449                /*
5450                 * check reference count again if the block isn't locked.
5451                 * we should start walking down the tree again if reference
5452                 * count is one.
5453                 */
5454                if (!path->locks[level]) {
5455                        BUG_ON(level == 0);
5456                        btrfs_tree_lock(eb);
5457                        path->locks[level] = BTRFS_WRITE_LOCK;
5458
5459                        ret = btrfs_lookup_extent_info(trans, fs_info,
5460                                                       eb->start, level, 1,
5461                                                       &wc->refs[level],
5462                                                       &wc->flags[level]);
5463                        if (ret < 0) {
5464                                btrfs_tree_unlock_rw(eb, path->locks[level]);
5465                                path->locks[level] = 0;
5466                                return ret;
5467                        }
5468                        BUG_ON(wc->refs[level] == 0);
5469                        if (wc->refs[level] == 1) {
5470                                btrfs_tree_unlock_rw(eb, path->locks[level]);
5471                                path->locks[level] = 0;
5472                                return 1;
5473                        }
5474                }
5475        }
5476
5477        /* wc->stage == DROP_REFERENCE */
5478        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5479
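            /*
             * Last reference on this block: for a leaf, first drop the refs
             * it holds on its data extents, then the block itself is freed
             * by btrfs_free_tree_block() further below.
             */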
5480        if (wc->refs[level] == 1) {
5481                if (level == 0) {
5482                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5483                                ret = btrfs_dec_ref(trans, root, eb, 1);
5484                        else
5485                                ret = btrfs_dec_ref(trans, root, eb, 0);
5486                        BUG_ON(ret); /* -ENOMEM */
5487                        if (is_fstree(root->root_key.objectid)) {
5488                                ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5489                                if (ret) {
5490                                        btrfs_err_rl(fs_info,
5491        "error %d accounting leaf items, quota is out of sync, rescan required",
5492                                             ret);
5493                                }
5494                        }
5495                }
5496                /* make block locked assertion in btrfs_clean_tree_block happy */
5497                if (!path->locks[level] &&
5498                    btrfs_header_generation(eb) == trans->transid) {
5499                        btrfs_tree_lock(eb);
5500                        path->locks[level] = BTRFS_WRITE_LOCK;
5501                }
5502                btrfs_clean_tree_block(eb);
5503        }
5504
5505        if (eb == root->node) {
5506                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5507                        parent = eb->start;
5508                else if (root->root_key.objectid != btrfs_header_owner(eb))
5509                        goto owner_mismatch;
5510        } else {
5511                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5512                        parent = path->nodes[level + 1]->start;
5513                else if (root->root_key.objectid !=
5514                         btrfs_header_owner(path->nodes[level + 1]))
5515                        goto owner_mismatch;
5516        }
5517
5518        btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5519                              wc->refs[level] == 1);
5520out:
5521        wc->refs[level] = 0;
5522        wc->flags[level] = 0;
5523        return 0;
5524
5525owner_mismatch:
5526        btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5527                     btrfs_header_owner(eb), root->root_key.objectid);
5528        return -EUCLEAN;
5529}
5530
5531static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5532                                   struct btrfs_root *root,
5533                                   struct btrfs_path *path,
5534                                   struct walk_control *wc)
5535{
5536        int level = wc->level;
5537        int lookup_info = 1;
5538        int ret;
5539
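            /*
             * Descend one level per iteration: walk_down_proc() processes
             * the block at the current level, do_walk_down() reads and
             * locks the next child.  A positive return from do_walk_down()
             * means the child was skipped, so advance to the next slot.
             */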
5540        while (level >= 0) {
5541                ret = walk_down_proc(trans, root, path, wc, lookup_info);
5542                if (ret > 0)
5543                        break;
5544
5545                if (level == 0)
5546                        break;
5547
5548                if (path->slots[level] >=
5549                    btrfs_header_nritems(path->nodes[level]))
5550                        break;
5551
5552                ret = do_walk_down(trans, root, path, wc, &lookup_info);
5553                if (ret > 0) {
5554                        path->slots[level]++;
5555                        continue;
5556                } else if (ret < 0)
5557                        return ret;
5558                level = wc->level;
5559        }
5560        return 0;
5561}
5562
5563static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5564                                 struct btrfs_root *root,
5565                                 struct btrfs_path *path,
5566                                 struct walk_control *wc, int max_level)
5567{
5568        int level = wc->level;
5569        int ret;
5570
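            /*
             * Move up the tree until we find a node with unvisited slots
             * (return 0 so the caller resumes walking down) or the walk is
             * finished (return 1).
             */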
5571        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5572        while (level < max_level && path->nodes[level]) {
5573                wc->level = level;
5574                if (path->slots[level] + 1 <
5575                    btrfs_header_nritems(path->nodes[level])) {
5576                        path->slots[level]++;
5577                        return 0;
5578                } else {
5579                        ret = walk_up_proc(trans, root, path, wc);
5580                        if (ret > 0)
5581                                return 0;
5582                        if (ret < 0)
5583                                return ret;
5584
5585                        if (path->locks[level]) {
5586                                btrfs_tree_unlock_rw(path->nodes[level],
5587                                                     path->locks[level]);
5588                                path->locks[level] = 0;
5589                        }
5590                        free_extent_buffer(path->nodes[level]);
5591                        path->nodes[level] = NULL;
5592                        level++;
5593                }
5594        }
5595        return 1;
5596}
5597
5598/*
5599 * drop a subvolume tree.
5600 *
5601 * this function traverses the tree, freeing any blocks that are only
5602 * referenced by the tree.
5603 *
5604 * when a shared tree block is found, this function decreases its
5605 * reference count by one. if update_ref is true, this function
5606 * also makes sure backrefs for the shared block and all lower level
5607 * blocks are properly updated.
5608 *
5609 * If called with for_reloc == 0, this may exit early with -EAGAIN.
5610 */
5611int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5612{
5613        struct btrfs_fs_info *fs_info = root->fs_info;
5614        struct btrfs_path *path;
5615        struct btrfs_trans_handle *trans;
5616        struct btrfs_root *tree_root = fs_info->tree_root;
5617        struct btrfs_root_item *root_item = &root->root_item;
5618        struct walk_control *wc;
5619        struct btrfs_key key;
5620        int err = 0;
5621        int ret;
5622        int level;
5623        bool root_dropped = false;
5624        bool unfinished_drop = false;
5625
5626        btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5627
5628        path = btrfs_alloc_path();
5629        if (!path) {
5630                err = -ENOMEM;
5631                goto out;
5632        }
5633
5634        wc = kzalloc(sizeof(*wc), GFP_NOFS);
5635        if (!wc) {
5636                btrfs_free_path(path);
5637                err = -ENOMEM;
5638                goto out;
5639        }
5640
5641        /*
5642         * Use join to avoid potential EINTR from transaction start. See
5643         * wait_reserve_ticket and the whole reservation callchain.
5644         */
5645        if (for_reloc)
5646                trans = btrfs_join_transaction(tree_root);
5647        else
5648                trans = btrfs_start_transaction(tree_root, 0);
5649        if (IS_ERR(trans)) {
5650                err = PTR_ERR(trans);
5651                goto out_free;
5652        }
5653
5654        err = btrfs_run_delayed_items(trans);
5655        if (err)
5656                goto out_end_trans;
5657
5658        /*
5659         * This will help us catch people modifying the fs tree while we're
5660         * dropping it.  It is unsafe to mess with the fs tree while it's being
5661         * dropped as we unlock the root node and parent nodes as we walk down
5662         * the tree, assuming nothing will change.  If something does change
5663         * then we'll have stale information and drop references to blocks we've
5664         * already dropped.
5665         */
5666        set_bit(BTRFS_ROOT_DELETING, &root->state);
5667        unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5668
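            /*
             * A zero drop_progress key means this is a fresh drop starting
             * at the root node; otherwise resume from the key and level
             * recorded by a previously interrupted drop.
             */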
5669        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5670                level = btrfs_header_level(root->node);
5671                path->nodes[level] = btrfs_lock_root_node(root);
5672                path->slots[level] = 0;
5673                path->locks[level] = BTRFS_WRITE_LOCK;
5674                memset(&wc->update_progress, 0,
5675                       sizeof(wc->update_progress));
5676        } else {
5677                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5678                memcpy(&wc->update_progress, &key,
5679                       sizeof(wc->update_progress));
5680
5681                level = btrfs_root_drop_level(root_item);
5682                BUG_ON(level == 0);
5683                path->lowest_level = level;
5684                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5685                path->lowest_level = 0;
5686                if (ret < 0) {
5687                        err = ret;
5688                        goto out_end_trans;
5689                }
5690                WARN_ON(ret > 0);
5691
5692                /*
5693                 * unlock our path; this is safe because only this
5694                 * function is allowed to delete this snapshot.
5695                 */
5696                btrfs_unlock_up_safe(path, 0);
5697
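                    /*
                     * Walk from the root down to the recorded drop level,
                     * locking each node and re-reading its reference count
                     * and flags into the walk control.
                     */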
5698                level = btrfs_header_level(root->node);
5699                while (1) {
5700                        btrfs_tree_lock(path->nodes[level]);
5701                        path->locks[level] = BTRFS_WRITE_LOCK;
5702
5703                        ret = btrfs_lookup_extent_info(trans, fs_info,
5704                                                path->nodes[level]->start,
5705                                                level, 1, &wc->refs[level],
5706                                                &wc->flags[level]);
5707                        if (ret < 0) {
5708                                err = ret;
5709                                goto out_end_trans;
5710                        }
5711                        BUG_ON(wc->refs[level] == 0);
5712
5713                        if (level == btrfs_root_drop_level(root_item))
5714                                break;
5715
5716                        btrfs_tree_unlock(path->nodes[level]);
5717                        path->locks[level] = 0;
5718                        WARN_ON(wc->refs[level] != 1);
5719                        level--;
5720                }
5721        }
5722
5723        wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5724        wc->level = level;
5725        wc->shared_level = -1;
5726        wc->stage = DROP_REFERENCE;
5727        wc->update_ref = update_ref;
5728        wc->keep_locks = 0;
5729        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5730
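            /*
             * Main drop loop: walk down as far as possible, then walk back
             * up freeing blocks, periodically recording drop_progress in
             * the root item so the drop can resume after a transaction end
             * or a restart.
             */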
5731        while (1) {
5732
5733                ret = walk_down_tree(trans, root, path, wc);
5734                if (ret < 0) {
5735                        err = ret;
5736                        break;
5737                }
5738
5739                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5740                if (ret < 0) {
5741                        err = ret;
5742                        break;
5743                }
5744
5745                if (ret > 0) {
5746                        BUG_ON(wc->stage != DROP_REFERENCE);
5747                        break;
5748                }
5749
5750                if (wc->stage == DROP_REFERENCE) {
5751                        wc->drop_level = wc->level;
5752                        btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5753                                              &wc->drop_progress,
5754                                              path->slots[wc->drop_level]);
5755                }
5756                btrfs_cpu_key_to_disk(&root_item->drop_progress,
5757                                      &wc->drop_progress);
5758                btrfs_set_root_drop_level(root_item, wc->drop_level);
5759
5760                BUG_ON(wc->level == 0);
5761                if (btrfs_should_end_transaction(trans) ||
5762                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5763                        ret = btrfs_update_root(trans, tree_root,
5764                                                &root->root_key,
5765                                                root_item);
5766                        if (ret) {
5767                                btrfs_abort_transaction(trans, ret);
5768                                err = ret;
5769                                goto out_end_trans;
5770                        }
5771
5772                        btrfs_end_transaction_throttle(trans);
5773                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5774                                btrfs_debug(fs_info,
5775                                            "drop snapshot early exit");
5776                                err = -EAGAIN;
5777                                goto out_free;
5778                        }
5779
5780                        /*
5781                         * Use join to avoid potential EINTR from transaction
5782                         * start. See wait_reserve_ticket and the whole
5783                         * reservation callchain.
5784                         */
5785                        if (for_reloc)
5786                                trans = btrfs_join_transaction(tree_root);
5787                        else
5788                                trans = btrfs_start_transaction(tree_root, 0);
5789                        if (IS_ERR(trans)) {
5790                                err = PTR_ERR(trans);
5791                                goto out_free;
5792                        }
5793                }
5794        }
5795        btrfs_release_path(path);
5796        if (err)
5797                goto out_end_trans;
5798
5799        ret = btrfs_del_root(trans, &root->root_key);
5800        if (ret) {
5801                btrfs_abort_transaction(trans, ret);
5802                err = ret;
5803                goto out_end_trans;
5804        }
5805
5806        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5807                ret = btrfs_find_root(tree_root, &root->root_key, path,
5808                                      NULL, NULL);
5809                if (ret < 0) {
5810                        btrfs_abort_transaction(trans, ret);
5811                        err = ret;
5812                        goto out_end_trans;
5813                } else if (ret > 0) {
5814                        /* if we fail to delete the orphan item this time
5815                         * around, it'll get picked up the next time.
5816                         *
5817                         * The most common failure here is just -ENOENT.
5818                         */
5819                        btrfs_del_orphan_item(trans, tree_root,
5820                                              root->root_key.objectid);
5821                }
5822        }
5823
5824        /*
5825         * This subvolume is going to be completely dropped, and won't be
5826         * recorded as a dirty root, so its pertrans meta rsv will not be freed
5827         * at transaction commit time.  Free it here manually.
5828         */
5829        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5830        btrfs_qgroup_free_meta_all_pertrans(root);
5831
5832        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5833                btrfs_add_dropped_root(trans, root);
5834        else
5835                btrfs_put_root(root);
5836        root_dropped = true;
5837out_end_trans:
5838        btrfs_end_transaction_throttle(trans);
5839out_free:
5840        kfree(wc);
5841        btrfs_free_path(path);
5842out:
5843        /*
5844         * If we were an unfinished drop root, check to see if there are any
5845         * others pending, and if not, clear the flag and wake up any waiters.
5846         */
5847        if (!err && unfinished_drop)
5848                btrfs_maybe_wake_unfinished_drop(fs_info);
5849
5850        /*
5851         * So if we need to stop dropping the snapshot for whatever reason we
5852         * need to make sure to add it back to the dead root list so that we
5853         * keep trying to do the work later.  This also cleans up the root if
5854         * it isn't in the radix (like when we recover after a power fail
5855         * or unmount) so we don't leak memory.
5856         */
5857        if (!for_reloc && !root_dropped)
5858                btrfs_add_dead_root(root);
5859        return err;
5860}
5861
5862/*
5863 * drop subtree rooted at tree block 'node'.
5864 *
5865 * NOTE: this function will unlock and release tree block 'node'.
5866 * Only used by relocation code.
5867 */
5868int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5869                        struct btrfs_root *root,
5870                        struct extent_buffer *node,
5871                        struct extent_buffer *parent)
5872{
5873        struct btrfs_fs_info *fs_info = root->fs_info;
5874        struct btrfs_path *path;
5875        struct walk_control *wc;
5876        int level;
5877        int parent_level;
5878        int ret = 0;
5879        int wret;
5880
5881        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5882
5883        path = btrfs_alloc_path();
5884        if (!path)
5885                return -ENOMEM;
5886
5887        wc = kzalloc(sizeof(*wc), GFP_NOFS);
5888        if (!wc) {
5889                btrfs_free_path(path);
5890                return -ENOMEM;
5891        }
5892
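            /*
             * Both 'parent' and 'node' are write locked by the caller.  Set
             * up the path so the walk starts at 'node' and walk_up_tree()
             * stops once it reaches 'parent'.
             */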
5893        btrfs_assert_tree_write_locked(parent);
5894        parent_level = btrfs_header_level(parent);
5895        atomic_inc(&parent->refs);
5896        path->nodes[parent_level] = parent;
5897        path->slots[parent_level] = btrfs_header_nritems(parent);
5898
5899        btrfs_assert_tree_write_locked(node);
5900        level = btrfs_header_level(node);
5901        path->nodes[level] = node;
5902        path->slots[level] = 0;
5903        path->locks[level] = BTRFS_WRITE_LOCK;
5904
5905        wc->refs[parent_level] = 1;
5906        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5907        wc->level = level;
5908        wc->shared_level = -1;
5909        wc->stage = DROP_REFERENCE;
5910        wc->update_ref = 0;
5911        wc->keep_locks = 1;
5912        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5913
5914        while (1) {
5915                wret = walk_down_tree(trans, root, path, wc);
5916                if (wret < 0) {
5917                        ret = wret;
5918                        break;
5919                }
5920
5921                wret = walk_up_tree(trans, root, path, wc, parent_level);
5922                if (wret < 0)
5923                        ret = wret;
5924                if (wret != 0)
5925                        break;
5926        }
5927
5928        kfree(wc);
5929        btrfs_free_path(path);
5930        return ret;
5931}
5932
5933/*
5934 * helper to account the unused space of all the readonly block groups in the
5935 * space_info. Takes mirrors into account.
5936 */
5937u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
5938{
5939        struct btrfs_block_group *block_group;
5940        u64 free_bytes = 0;
5941        int factor;
5942
5943        /* It's df, we don't care if it's racy */
5944        if (list_empty(&sinfo->ro_bgs))
5945                return 0;
5946
5947        spin_lock(&sinfo->lock);
5948        list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
5949                spin_lock(&block_group->lock);
5950
5951                if (!block_group->ro) {
5952                        spin_unlock(&block_group->lock);
5953                        continue;
5954                }
5955
5956                factor = btrfs_bg_type_to_factor(block_group->flags);
5957                free_bytes += (block_group->length -
5958                               block_group->used) * factor;
5959
5960                spin_unlock(&block_group->lock);
5961        }
5962        spin_unlock(&sinfo->lock);
5963
5964        return free_bytes;
5965}
5966
5967int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5968                                   u64 start, u64 end)
5969{
5970        return unpin_extent_range(fs_info, start, end, false);
5971}
5972
5973/*
5974 * It used to be that old block groups would be left around forever.
5975 * Iterating over them would be enough to trim unused space.  Since we
5976 * now automatically remove them, we also need to iterate over unallocated
5977 * space.
5978 *
5979 * We don't want a transaction for this since the discard may take a
5980 * substantial amount of time.  We don't require that a transaction be
5981 * running, but we do need to take a running transaction into account
5982 * to ensure that we're not discarding chunks that were released or
5983 * allocated in the current transaction.
5984 *
5985 * Holding the chunks lock will prevent other threads from allocating
5986 * or releasing chunks, but it won't prevent a running transaction
5987 * from committing and releasing the memory that the pending chunks
5988 * list head uses.  For that, we need to take a reference to the
5989 * transaction and hold the commit root sem.  We only need to hold
5990 * it while performing the free space search since we have already
5991 * held back allocations.
5992 */
5993static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5994{
5995        u64 start = SZ_1M, len = 0, end = 0;
5996        int ret;
5997
5998        *trimmed = 0;
5999
6000        /* Discard not supported = nothing to do. */
6001        if (!bdev_max_discard_sectors(device->bdev))
6002                return 0;
6003
6004        /* Not writable = nothing to do. */
6005        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6006                return 0;
6007
6008        /* No free space = nothing to do. */
6009        if (device->total_bytes <= device->bytes_used)
6010                return 0;
6011
6012        ret = 0;
6013
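            /*
             * Walk the gaps in the device's alloc_state, discarding each
             * range that is neither allocated nor already trimmed.
             * chunk_mutex is held across each discard so chunk allocation
             * cannot race with it.
             */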
6014        while (1) {
6015                struct btrfs_fs_info *fs_info = device->fs_info;
6016                u64 bytes;
6017
6018                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6019                if (ret)
6020                        break;
6021
6022                find_first_clear_extent_bit(&device->alloc_state, start,
6023                                            &start, &end,
6024                                            CHUNK_TRIMMED | CHUNK_ALLOCATED);
6025
6026                /* Check if there are any CHUNK_* bits left */
6027                if (start > device->total_bytes) {
6028                        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6029                        btrfs_warn_in_rcu(fs_info,
6030"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6031                                          start, end - start + 1,
6032                                          rcu_str_deref(device->name),
6033                                          device->total_bytes);
6034                        mutex_unlock(&fs_info->chunk_mutex);
6035                        ret = 0;
6036                        break;
6037                }
6038
6039                /* Ensure we skip the reserved area in the first 1M */
6040                start = max_t(u64, start, SZ_1M);
6041
6042                /*
6043                 * If find_first_clear_extent_bit finds a range that spans the
6044                 * end of the device it will set end to -1; in this case it's up
6045                 * to the caller to trim the value to the size of the device.
6046                 */
6047                end = min(end, device->total_bytes - 1);
6048
6049                len = end - start + 1;
6050
6051                /* We didn't find any extents */
6052                if (!len) {
6053                        mutex_unlock(&fs_info->chunk_mutex);
6054                        ret = 0;
6055                        break;
6056                }
6057
6058                ret = btrfs_issue_discard(device->bdev, start, len,
6059                                          &bytes);
6060                if (!ret)
6061                        set_extent_bits(&device->alloc_state, start,
6062                                        start + bytes - 1,
6063                                        CHUNK_TRIMMED);
6064                mutex_unlock(&fs_info->chunk_mutex);
6065
6066                if (ret)
6067                        break;
6068
6069                start += len;
6070                *trimmed += bytes;
6071
6072                if (fatal_signal_pending(current)) {
6073                        ret = -ERESTARTSYS;
6074                        break;
6075                }
6076
6077                cond_resched();
6078        }
6079
6080        return ret;
6081}
6082
6083/*
6084 * Trim the whole filesystem by:
6085 * 1) trimming the free space in each block group
6086 * 2) trimming the unallocated space on each device
6087 *
6088 * This will also continue trimming even if a block group or device encounters
6089 * an error.  The return value will be the last error, or 0 if nothing bad
6090 * happens.
6091 */
6092int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6093{
6094        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6095        struct btrfs_block_group *cache = NULL;
6096        struct btrfs_device *device;
6097        u64 group_trimmed;
6098        u64 range_end = U64_MAX;
6099        u64 start;
6100        u64 end;
6101        u64 trimmed = 0;
6102        u64 bg_failed = 0;
6103        u64 dev_failed = 0;
6104        int bg_ret = 0;
6105        int dev_ret = 0;
6106        int ret = 0;
6107
6108        if (range->start == U64_MAX)
6109                return -EINVAL;
6110
6111        /*
6112         * Check range overflow if range->len is set.
6113         * The default range->len is U64_MAX.
6114         */
6115        if (range->len != U64_MAX &&
6116            check_add_overflow(range->start, range->len, &range_end))
6117                return -EINVAL;
6118
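            /*
             * First trim the free space of every block group that overlaps
             * the requested range.  Errors are counted and remembered but
             * do not stop the loop.
             */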
6119        cache = btrfs_lookup_first_block_group(fs_info, range->start);
6120        for (; cache; cache = btrfs_next_block_group(cache)) {
6121                if (cache->start >= range_end) {
6122                        btrfs_put_block_group(cache);
6123                        break;
6124                }
6125
6126                start = max(range->start, cache->start);
6127                end = min(range_end, cache->start + cache->length);
6128
6129                if (end - start >= range->minlen) {
6130                        if (!btrfs_block_group_done(cache)) {
6131                                ret = btrfs_cache_block_group(cache, 0);
6132                                if (ret) {
6133                                        bg_failed++;
6134                                        bg_ret = ret;
6135                                        continue;
6136                                }
6137                                ret = btrfs_wait_block_group_cache_done(cache);
6138                                if (ret) {
6139                                        bg_failed++;
6140                                        bg_ret = ret;
6141                                        continue;
6142                                }
6143                        }
6144                        ret = btrfs_trim_block_group(cache,
6145                                                     &group_trimmed,
6146                                                     start,
6147                                                     end,
6148                                                     range->minlen);
6149
6150                        trimmed += group_trimmed;
6151                        if (ret) {
6152                                bg_failed++;
6153                                bg_ret = ret;
6154                                continue;
6155                        }
6156                }
6157        }
6158
6159        if (bg_failed)
6160                btrfs_warn(fs_info,
6161                        "failed to trim %llu block group(s), last error %d",
6162                        bg_failed, bg_ret);
6163
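            /*
             * Then trim the unallocated space on each present (non-missing)
             * device.
             */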
6164        mutex_lock(&fs_devices->device_list_mutex);
6165        list_for_each_entry(device, &fs_devices->devices, dev_list) {
6166                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6167                        continue;
6168
6169                ret = btrfs_trim_free_extents(device, &group_trimmed);
6170                if (ret) {
6171                        dev_failed++;
6172                        dev_ret = ret;
6173                        break;
6174                }
6175
6176                trimmed += group_trimmed;
6177        }
6178        mutex_unlock(&fs_devices->device_list_mutex);
6179
6180        if (dev_failed)
6181                btrfs_warn(fs_info,
6182                        "failed to trim %llu device(s), last error %d",
6183                        dev_failed, dev_ret);
6184        range->len = trimmed;
6185        if (bg_ret)
6186                return bg_ret;
6187        return dev_ret;
6188}
6189