linux/fs/btrfs/block-rsv.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include "misc.h"
   4#include "ctree.h"
   5#include "block-rsv.h"
   6#include "space-info.h"
   7#include "transaction.h"
   8#include "block-group.h"
   9
  10/*
  11 * HOW DO BLOCK RESERVES WORK
  12 *
  13 *   Think of block_rsv's as buckets for logically grouped metadata
  14 *   reservations.  Each block_rsv has a ->size and a ->reserved.  ->size is
  15 *   how large we want our block rsv to be, ->reserved is how much space is
  16 *   currently reserved for this block reserve.
  17 *
  18 *   ->failfast exists for the truncate case, and is described below.
  19 *
  20 * NORMAL OPERATION
  21 *
  22 *   -> Reserve
  23 *     Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
  24 *
  25 *     We call into btrfs_reserve_metadata_bytes() with our bytes, which is
  26 *     accounted for in space_info->bytes_may_use, and then add the bytes to
  27 *     ->reserved, and ->size in the case of btrfs_block_rsv_add.
  28 *
  29 *     ->size is an over-estimation of how much we may use for a particular
  30 *     operation.
  31 *
  32 *   -> Use
  33 *     Entrance: btrfs_use_block_rsv
  34 *
  35 *     When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
  36 *     to determine the appropriate block_rsv to use, and then verify that
  37 *     ->reserved has enough space for our tree block allocation.  Once
  38 *     successful we subtract fs_info->nodesize from ->reserved.
  39 *
  40 *   -> Finish
  41 *     Entrance: btrfs_block_rsv_release
  42 *
  43 *     We are finished with our operation, subtract our individual reservation
  44 *     from ->size, and then subtract ->size from ->reserved and free up the
  45 *     excess if there is any.
  46 *
  47 *     There is some logic here to refill the delayed refs rsv or the global rsv
  48 *     as needed, otherwise the excess is subtracted from
  49 *     space_info->bytes_may_use.
  50 *
  51 * TYPES OF BLOCK RESERVES
  52 *
  53 * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
  54 *   These behave normally, as described above, just within the confines of the
  55 *   lifetime of their particular operation (transaction for the whole trans
  56 *   handle lifetime, for example).
  57 *
  58 * BLOCK_RSV_GLOBAL
  59 *   It is impossible to properly account for all the space that may be required
  60 *   to make our extent tree updates.  This block reserve acts as an overflow
  61 *   buffer in case our delayed refs reserve does not reserve enough space to
  62 *   update the extent tree.
  63 *
  64 *   We can steal from this in some cases as well, notably on evict() or
  65 *   truncate() in order to help users recover from ENOSPC conditions.
  66 *
  67 * BLOCK_RSV_DELALLOC
  68 *   The individual item sizes are determined by the per-inode size
  69 *   calculations, which are described with the delalloc code.  This is pretty
  70 *   straightforward, it's just the calculation of ->size encodes a lot of
  71 *   different items, and thus it gets used when updating inodes, inserting file
  72 *   extents, and inserting checksums.
  73 *
  74 * BLOCK_RSV_DELREFS
  75 *   We keep a running tally of how many delayed refs we have on the system.
   76 *   We assume each one of these delayed refs is going to use a full
  77 *   reservation.  We use the transaction items and pre-reserve space for every
  78 *   operation, and use this reservation to refill any gap between ->size and
  79 *   ->reserved that may exist.
  80 *
  81 *   From there it's straightforward, removing a delayed ref means we remove its
  82 *   count from ->size and free up reservations as necessary.  Since this is
  83 *   the most dynamic block reserve in the system, we will try to refill this
  84 *   block reserve first with any excess returned by any other block reserve.
  85 *
  86 * BLOCK_RSV_EMPTY
  87 *   This is the fallback block reserve to make us try to reserve space if we
  88 *   don't have a specific bucket for this allocation.  It is mostly used for
  89 *   updating the device tree and such, since that is a separate pool we're
  90 *   content to just reserve space from the space_info on demand.
  91 *
  92 * BLOCK_RSV_TEMP
  93 *   This is used by things like truncate and iput.  We will temporarily
  94 *   allocate a block reserve, set it to some size, and then truncate bytes
  95 *   until we have no space left.  With ->failfast set we'll simply return
  96 *   ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
  97 *   to make a new reservation.  This is because these operations are
  98 *   unbounded, so we want to do as much work as we can, and then back off and
  99 *   re-reserve.
 100 */
 101
 102static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 103                                    struct btrfs_block_rsv *block_rsv,
 104                                    struct btrfs_block_rsv *dest, u64 num_bytes,
 105                                    u64 *qgroup_to_release_ret)
 106{
 107        struct btrfs_space_info *space_info = block_rsv->space_info;
 108        u64 qgroup_to_release = 0;
 109        u64 ret;
 110
 111        spin_lock(&block_rsv->lock);
 112        if (num_bytes == (u64)-1) {
 113                num_bytes = block_rsv->size;
 114                qgroup_to_release = block_rsv->qgroup_rsv_size;
 115        }
 116        block_rsv->size -= num_bytes;
 117        if (block_rsv->reserved >= block_rsv->size) {
 118                num_bytes = block_rsv->reserved - block_rsv->size;
 119                block_rsv->reserved = block_rsv->size;
 120                block_rsv->full = 1;
 121        } else {
 122                num_bytes = 0;
 123        }
 124        if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
 125                qgroup_to_release = block_rsv->qgroup_rsv_reserved -
 126                                    block_rsv->qgroup_rsv_size;
 127                block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
 128        } else {
 129                qgroup_to_release = 0;
 130        }
 131        spin_unlock(&block_rsv->lock);
 132
 133        ret = num_bytes;
 134        if (num_bytes > 0) {
 135                if (dest) {
 136                        spin_lock(&dest->lock);
 137                        if (!dest->full) {
 138                                u64 bytes_to_add;
 139
 140                                bytes_to_add = dest->size - dest->reserved;
 141                                bytes_to_add = min(num_bytes, bytes_to_add);
 142                                dest->reserved += bytes_to_add;
 143                                if (dest->reserved >= dest->size)
 144                                        dest->full = 1;
 145                                num_bytes -= bytes_to_add;
 146                        }
 147                        spin_unlock(&dest->lock);
 148                }
 149                if (num_bytes)
 150                        btrfs_space_info_free_bytes_may_use(fs_info,
 151                                                            space_info,
 152                                                            num_bytes);
 153        }
 154        if (qgroup_to_release_ret)
 155                *qgroup_to_release_ret = qgroup_to_release;
 156        return ret;
 157}
 158
 159int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
 160                            struct btrfs_block_rsv *dst, u64 num_bytes,
 161                            bool update_size)
 162{
 163        int ret;
 164
 165        ret = btrfs_block_rsv_use_bytes(src, num_bytes);
 166        if (ret)
 167                return ret;
 168
 169        btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
 170        return 0;
 171}
 172
 173void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 174{
 175        memset(rsv, 0, sizeof(*rsv));
 176        spin_lock_init(&rsv->lock);
 177        rsv->type = type;
 178}
 179
 180void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 181                                   struct btrfs_block_rsv *rsv,
 182                                   unsigned short type)
 183{
 184        btrfs_init_block_rsv(rsv, type);
 185        rsv->space_info = btrfs_find_space_info(fs_info,
 186                                            BTRFS_BLOCK_GROUP_METADATA);
 187}
 188
 189struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 190                                              unsigned short type)
 191{
 192        struct btrfs_block_rsv *block_rsv;
 193
 194        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 195        if (!block_rsv)
 196                return NULL;
 197
 198        btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
 199        return block_rsv;
 200}
 201
 202void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 203                          struct btrfs_block_rsv *rsv)
 204{
 205        if (!rsv)
 206                return;
 207        btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
 208        kfree(rsv);
 209}
 210
 211int btrfs_block_rsv_add(struct btrfs_root *root,
 212                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 213                        enum btrfs_reserve_flush_enum flush)
 214{
 215        int ret;
 216
 217        if (num_bytes == 0)
 218                return 0;
 219
 220        ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 221        if (!ret)
 222                btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
 223
 224        return ret;
 225}
 226
 227int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 228{
 229        u64 num_bytes = 0;
 230        int ret = -ENOSPC;
 231
 232        if (!block_rsv)
 233                return 0;
 234
 235        spin_lock(&block_rsv->lock);
 236        num_bytes = div_factor(block_rsv->size, min_factor);
 237        if (block_rsv->reserved >= num_bytes)
 238                ret = 0;
 239        spin_unlock(&block_rsv->lock);
 240
 241        return ret;
 242}
 243
 244int btrfs_block_rsv_refill(struct btrfs_root *root,
 245                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 246                           enum btrfs_reserve_flush_enum flush)
 247{
 248        u64 num_bytes = 0;
 249        int ret = -ENOSPC;
 250
 251        if (!block_rsv)
 252                return 0;
 253
 254        spin_lock(&block_rsv->lock);
 255        num_bytes = min_reserved;
 256        if (block_rsv->reserved >= num_bytes)
 257                ret = 0;
 258        else
 259                num_bytes -= block_rsv->reserved;
 260        spin_unlock(&block_rsv->lock);
 261
 262        if (!ret)
 263                return 0;
 264
 265        ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 266        if (!ret) {
 267                btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
 268                return 0;
 269        }
 270
 271        return ret;
 272}
 273
 274u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 275                            struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 276                            u64 *qgroup_to_release)
 277{
 278        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 279        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
 280        struct btrfs_block_rsv *target = NULL;
 281
 282        /*
 283         * If we are the delayed_rsv then push to the global rsv, otherwise dump
 284         * into the delayed rsv if it is not full.
 285         */
 286        if (block_rsv == delayed_rsv)
 287                target = global_rsv;
 288        else if (block_rsv != global_rsv && !delayed_rsv->full)
 289                target = delayed_rsv;
 290
 291        if (target && block_rsv->space_info != target->space_info)
 292                target = NULL;
 293
 294        return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
 295                                       qgroup_to_release);
 296}
 297
 298int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
 299{
 300        int ret = -ENOSPC;
 301
 302        spin_lock(&block_rsv->lock);
 303        if (block_rsv->reserved >= num_bytes) {
 304                block_rsv->reserved -= num_bytes;
 305                if (block_rsv->reserved < block_rsv->size)
 306                        block_rsv->full = 0;
 307                ret = 0;
 308        }
 309        spin_unlock(&block_rsv->lock);
 310        return ret;
 311}
 312
 313void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 314                               u64 num_bytes, bool update_size)
 315{
 316        spin_lock(&block_rsv->lock);
 317        block_rsv->reserved += num_bytes;
 318        if (update_size)
 319                block_rsv->size += num_bytes;
 320        else if (block_rsv->reserved >= block_rsv->size)
 321                block_rsv->full = 1;
 322        spin_unlock(&block_rsv->lock);
 323}
 324
 325int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 326                             struct btrfs_block_rsv *dest, u64 num_bytes,
 327                             int min_factor)
 328{
 329        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 330        u64 min_bytes;
 331
 332        if (global_rsv->space_info != dest->space_info)
 333                return -ENOSPC;
 334
 335        spin_lock(&global_rsv->lock);
 336        min_bytes = div_factor(global_rsv->size, min_factor);
 337        if (global_rsv->reserved < min_bytes + num_bytes) {
 338                spin_unlock(&global_rsv->lock);
 339                return -ENOSPC;
 340        }
 341        global_rsv->reserved -= num_bytes;
 342        if (global_rsv->reserved < global_rsv->size)
 343                global_rsv->full = 0;
 344        spin_unlock(&global_rsv->lock);
 345
 346        btrfs_block_rsv_add_bytes(dest, num_bytes, true);
 347        return 0;
 348}
 349
 350void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 351{
 352        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 353        struct btrfs_space_info *sinfo = block_rsv->space_info;
 354        u64 num_bytes;
 355        unsigned min_items;
 356
 357        /*
 358         * The global block rsv is based on the size of the extent tree, the
 359         * checksum tree and the root tree.  If the fs is empty we want to set
 360         * it to a minimal amount for safety.
 361         */
 362        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
 363                btrfs_root_used(&fs_info->csum_root->root_item) +
 364                btrfs_root_used(&fs_info->tree_root->root_item);
 365
 366        /*
 367         * We at a minimum are going to modify the csum root, the tree root, and
 368         * the extent root.
 369         */
 370        min_items = 3;
 371
 372        /*
 373         * But we also want to reserve enough space so we can do the fallback
 374         * global reserve for an unlink, which is an additional 5 items (see the
 375         * comment in __unlink_start_trans for what we're modifying.)
 376         *
 377         * But we also need space for the delayed ref updates from the unlink,
 378         * so its 10, 5 for the actual operation, and 5 for the delayed ref
 379         * updates.
 380         */
 381        min_items += 10;
 382
 383        num_bytes = max_t(u64, num_bytes,
 384                          btrfs_calc_insert_metadata_size(fs_info, min_items));
 385
 386        spin_lock(&sinfo->lock);
 387        spin_lock(&block_rsv->lock);
 388
 389        block_rsv->size = min_t(u64, num_bytes, SZ_512M);
 390
 391        if (block_rsv->reserved < block_rsv->size) {
 392                num_bytes = block_rsv->size - block_rsv->reserved;
 393                btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
 394                                                      num_bytes);
 395                block_rsv->reserved = block_rsv->size;
 396        } else if (block_rsv->reserved > block_rsv->size) {
 397                num_bytes = block_rsv->reserved - block_rsv->size;
 398                btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
 399                                                      -num_bytes);
 400                block_rsv->reserved = block_rsv->size;
 401                btrfs_try_granting_tickets(fs_info, sinfo);
 402        }
 403
 404        if (block_rsv->reserved == block_rsv->size)
 405                block_rsv->full = 1;
 406        else
 407                block_rsv->full = 0;
 408
 409        if (block_rsv->size >= sinfo->total_bytes)
 410                sinfo->force_alloc = CHUNK_ALLOC_FORCE;
 411        spin_unlock(&block_rsv->lock);
 412        spin_unlock(&sinfo->lock);
 413}
 414
 415void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 416{
 417        struct btrfs_space_info *space_info;
 418
 419        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 420        fs_info->chunk_block_rsv.space_info = space_info;
 421
 422        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 423        fs_info->global_block_rsv.space_info = space_info;
 424        fs_info->trans_block_rsv.space_info = space_info;
 425        fs_info->empty_block_rsv.space_info = space_info;
 426        fs_info->delayed_block_rsv.space_info = space_info;
 427        fs_info->delayed_refs_rsv.space_info = space_info;
 428
 429        /*
 430         * Our various recovery options can leave us with NULL roots, so check
 431         * here and just bail before we go dereferencing NULLs everywhere.
 432         */
 433        if (!fs_info->extent_root || !fs_info->csum_root ||
 434            !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
 435                return;
 436
 437        fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
 438        fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
 439        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 440        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 441        if (fs_info->quota_root)
 442                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 443        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 444
 445        btrfs_update_global_block_rsv(fs_info);
 446}
 447
 448void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
 449{
 450        btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
 451                                NULL);
 452        WARN_ON(fs_info->trans_block_rsv.size > 0);
 453        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 454        WARN_ON(fs_info->chunk_block_rsv.size > 0);
 455        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 456        WARN_ON(fs_info->delayed_block_rsv.size > 0);
 457        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 458        WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
 459        WARN_ON(fs_info->delayed_refs_rsv.size > 0);
 460}
 461
 462static struct btrfs_block_rsv *get_block_rsv(
 463                                        const struct btrfs_trans_handle *trans,
 464                                        const struct btrfs_root *root)
 465{
 466        struct btrfs_fs_info *fs_info = root->fs_info;
 467        struct btrfs_block_rsv *block_rsv = NULL;
 468
 469        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
 470            (root == fs_info->csum_root && trans->adding_csums) ||
 471            (root == fs_info->uuid_root))
 472                block_rsv = trans->block_rsv;
 473
 474        if (!block_rsv)
 475                block_rsv = root->block_rsv;
 476
 477        if (!block_rsv)
 478                block_rsv = &fs_info->empty_block_rsv;
 479
 480        return block_rsv;
 481}
 482
 483struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
 484                                            struct btrfs_root *root,
 485                                            u32 blocksize)
 486{
 487        struct btrfs_fs_info *fs_info = root->fs_info;
 488        struct btrfs_block_rsv *block_rsv;
 489        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 490        int ret;
 491        bool global_updated = false;
 492
 493        block_rsv = get_block_rsv(trans, root);
 494
 495        if (unlikely(block_rsv->size == 0))
 496                goto try_reserve;
 497again:
 498        ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
 499        if (!ret)
 500                return block_rsv;
 501
 502        if (block_rsv->failfast)
 503                return ERR_PTR(ret);
 504
 505        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
 506                global_updated = true;
 507                btrfs_update_global_block_rsv(fs_info);
 508                goto again;
 509        }
 510
 511        /*
 512         * The global reserve still exists to save us from ourselves, so don't
 513         * warn_on if we are short on our delayed refs reserve.
 514         */
 515        if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
 516            btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 517                static DEFINE_RATELIMIT_STATE(_rs,
 518                                DEFAULT_RATELIMIT_INTERVAL * 10,
 519                                /*DEFAULT_RATELIMIT_BURST*/ 1);
 520                if (__ratelimit(&_rs))
 521                        WARN(1, KERN_DEBUG
 522                                "BTRFS: block rsv %d returned %d\n",
 523                                block_rsv->type, ret);
 524        }
 525try_reserve:
 526        ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
 527                                           BTRFS_RESERVE_NO_FLUSH);
 528        if (!ret)
 529                return block_rsv;
 530        /*
 531         * If we couldn't reserve metadata bytes try and use some from
 532         * the global reserve if its space type is the same as the global
 533         * reservation.
 534         */
 535        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
 536            block_rsv->space_info == global_rsv->space_info) {
 537                ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
 538                if (!ret)
 539                        return global_rsv;
 540        }
 541        return ERR_PTR(ret);
 542}
 543