linux/fs/btrfs/delalloc-space.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include "ctree.h"
   4#include "delalloc-space.h"
   5#include "block-rsv.h"
   6#include "btrfs_inode.h"
   7#include "space-info.h"
   8#include "transaction.h"
   9#include "qgroup.h"
  10#include "block-group.h"
  11
  12/*
  13 * HOW DOES THIS WORK
  14 *
   15 * There are two stages to data reservations: one for the data itself and one
   16 * for the metadata needed for the new extents and checksums the write creates.
  17 *
  18 *
  19 * DATA RESERVATION
  20 *   The general flow of the data reservation is as follows
  21 *
  22 *   -> Reserve
  23 *     We call into btrfs_reserve_data_bytes() for the user request bytes that
  24 *     they wish to write.  We make this reservation and add it to
  25 *     space_info->bytes_may_use.  We set EXTENT_DELALLOC on the inode io_tree
  26 *     for the range and carry on if this is buffered, or follow up trying to
  27 *     make a real allocation if we are pre-allocating or doing O_DIRECT.
  28 *
  29 *   -> Use
  30 *     At writepages()/prealloc/O_DIRECT time we will call into
  31 *     btrfs_reserve_extent() for some part or all of this range of bytes.  We
  32 *     will make the allocation and subtract space_info->bytes_may_use by the
  33 *     original requested length and increase the space_info->bytes_reserved by
  34 *     the allocated length.  This distinction is important because compression
  35 *     may allocate a smaller on disk extent than we previously reserved.
  36 *
  37 *   -> Allocation
  38 *     finish_ordered_io() will insert the new file extent item for this range,
  39 *     and then add a delayed ref update for the extent tree.  Once that delayed
  40 *     ref is written the extent size is subtracted from
  41 *     space_info->bytes_reserved and added to space_info->bytes_used.
  42 *
  43 *   Error handling
  44 *
  45 *   -> By the reservation maker
   46 *     This is the simplest case: we haven't completed our operation and we know
   47 *     how much we reserved, so we can simply call
   48 *     btrfs_free_reserved_data_space*() and the amount will be removed from
   49 *     space_info->bytes_may_use.
  50 *
  51 *   -> After the reservation has been made, but before cow_file_range()
  52 *     This is specifically for the delalloc case.  You must clear
  53 *     EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
  54 *     be subtracted from space_info->bytes_may_use.
  55 *
  56 * METADATA RESERVATION
   57 *   The general metadata reservation lifetimes are discussed elsewhere; this
   58 *   section will just focus on how it is used for delalloc space.
  59 *
   60 *   We keep track of two things on a per-inode basis:
  61 *
  62 *   ->outstanding_extents
  63 *     This is the number of file extent items we'll need to handle all of the
  64 *     outstanding DELALLOC space we have in this inode.  We limit the maximum
  65 *     size of an extent, so a large contiguous dirty area may require more than
  66 *     one outstanding_extent, which is why count_max_extents() is used to
  67 *     determine how many outstanding_extents get added.
  68 *
  69 *   ->csum_bytes
  70 *     This is essentially how many dirty bytes we have for this inode, so we
  71 *     can calculate the number of checksum items we would have to add in order
  72 *     to checksum our outstanding data.
  73 *
  74 *   We keep a per-inode block_rsv in order to make it easier to keep track of
  75 *   our reservation.  We use btrfs_calculate_inode_block_rsv_size() to
  76 *   calculate the current theoretical maximum reservation we would need for the
  77 *   metadata for this inode.  We call this and then adjust our reservation as
  78 *   necessary, either by attempting to reserve more space, or freeing up excess
  79 *   space.
  80 *
  81 * OUTSTANDING_EXTENTS HANDLING
  82 *
  83 *  ->outstanding_extents is used for keeping track of how many extents we will
  84 *  need to use for this inode, and it will fluctuate depending on where you are
  85 *  in the life cycle of the dirty data.  Consider the following normal case for
  86 *  a completely clean inode, with a num_bytes < our maximum allowed extent size
  87 *
  88 *  -> reserve
  89 *    ->outstanding_extents += 1 (current value is 1)
  90 *
  91 *  -> set_delalloc
   92 *    ->outstanding_extents += 1 (current value is 2)
  93 *
  94 *  -> btrfs_delalloc_release_extents()
  95 *    ->outstanding_extents -= 1 (current value is 1)
  96 *
  97 *    We must call this once we are done, as we hold our reservation for the
  98 *    duration of our operation, and then assume set_delalloc will update the
  99 *    counter appropriately.
 100 *
 101 *  -> add ordered extent
 102 *    ->outstanding_extents += 1 (current value is 2)
 103 *
 104 *  -> btrfs_clear_delalloc_extent
 105 *    ->outstanding_extents -= 1 (current value is 1)
 106 *
 107 *  -> finish_ordered_io/btrfs_remove_ordered_extent
 108 *    ->outstanding_extents -= 1 (current value is 0)
 109 *
  110 *  Each stage is responsible for its own accounting of the extent, thus
 111 *  making error handling and cleanup easier.
 112 */
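
/*
 * Illustrative sketch (an editor's addition, not part of the original file):
 * how a hypothetical caller might pair the reservation and release calls
 * described above for a buffered write that bails out before the range is
 * marked delalloc.  The helper name is made up; the btrfs_* calls are the
 * ones declared in delalloc-space.h and defined below.
 */
static int __maybe_unused example_buffered_write_reservation(struct btrfs_inode *inode,
                                                              u64 start, u64 len)
{
        struct extent_changeset *data_reserved = NULL;
        int ret;

        /* Reserve data + qgroup space, then metadata; bumps ->outstanding_extents. */
        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
        if (ret < 0)
                return ret;

        /*
         * A real caller would dirty the pages and set EXTENT_DELALLOC here.
         * This sketch pretends that step failed, so the whole reservation is
         * handed back (qgroup_free == true because this is an error path).
         */
        btrfs_delalloc_release_space(inode, data_reserved, start, len, true);

        /* Drop the temporary outstanding_extents taken at reserve time. */
        btrfs_delalloc_release_extents(inode, len);
        extent_changeset_free(data_reserved);
        return ret;
}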
 113
 114int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 115{
 116        struct btrfs_root *root = inode->root;
 117        struct btrfs_fs_info *fs_info = root->fs_info;
 118        struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
 119        u64 used;
 120        int ret = 0;
 121        int need_commit = 2;
 122        int have_pinned_space;
 123
 124        /* Make sure bytes are sectorsize aligned */
 125        bytes = ALIGN(bytes, fs_info->sectorsize);
 126
 127        if (btrfs_is_free_space_inode(inode)) {
 128                need_commit = 0;
 129                ASSERT(current->journal_info);
 130        }
 131
 132again:
 133        /* Make sure we have enough space to handle the data first */
 134        spin_lock(&data_sinfo->lock);
 135        used = btrfs_space_info_used(data_sinfo, true);
 136
 137        if (used + bytes > data_sinfo->total_bytes) {
 138                struct btrfs_trans_handle *trans;
 139
 140                /*
 141                 * If we don't have enough free bytes in this space then we need
 142                 * to alloc a new chunk.
 143                 */
 144                if (!data_sinfo->full) {
 145                        u64 alloc_target;
 146
 147                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
 148                        spin_unlock(&data_sinfo->lock);
 149
 150                        alloc_target = btrfs_data_alloc_profile(fs_info);
  151                        /*
  152                         * It is ugly that we don't call a nolock join
  153                         * transaction for the free space inode case here,
  154                         * but it is safe: we only do the data space
  155                         * reservation for the free space cache from within
  156                         * a transaction context, and the common join
  157                         * transaction just increases the counter of the
  158                         * current transaction handle without trying to
  159                         * acquire the fs trans_lock.
  160                         */
 161                        trans = btrfs_join_transaction(root);
 162                        if (IS_ERR(trans))
 163                                return PTR_ERR(trans);
 164
 165                        ret = btrfs_chunk_alloc(trans, alloc_target,
 166                                                CHUNK_ALLOC_NO_FORCE);
 167                        btrfs_end_transaction(trans);
 168                        if (ret < 0) {
 169                                if (ret != -ENOSPC)
 170                                        return ret;
 171                                else {
 172                                        have_pinned_space = 1;
 173                                        goto commit_trans;
 174                                }
 175                        }
 176
 177                        goto again;
 178                }
 179
  180                /*
  181                 * If we don't have enough pinned space to cover this
  182                 * allocation, and no chunk was removed in the current
  183                 * transaction, don't bother committing the transaction.
  184                 */
 185                have_pinned_space = __percpu_counter_compare(
 186                        &data_sinfo->total_bytes_pinned,
 187                        used + bytes - data_sinfo->total_bytes,
 188                        BTRFS_TOTAL_BYTES_PINNED_BATCH);
 189                spin_unlock(&data_sinfo->lock);
 190
 191                /* Commit the current transaction and try again */
 192commit_trans:
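                /*
                 * Editor's note: need_commit starts at 2, allowing two commit
                 * attempts.  The first attempt flushes delalloc, waits for
                 * ordered extents and commits unconditionally; the second only
                 * commits if pinned space or a freed block group suggests it
                 * could actually release enough space.  The free space inode
                 * sets need_commit to 0 and never commits here.
                 */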
 193                if (need_commit) {
 194                        need_commit--;
 195
 196                        if (need_commit > 0) {
 197                                btrfs_start_delalloc_roots(fs_info, -1);
 198                                btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
 199                                                         (u64)-1);
 200                        }
 201
 202                        trans = btrfs_join_transaction(root);
 203                        if (IS_ERR(trans))
 204                                return PTR_ERR(trans);
 205                        if (have_pinned_space >= 0 ||
 206                            test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 207                                     &trans->transaction->flags) ||
 208                            need_commit > 0) {
 209                                ret = btrfs_commit_transaction(trans);
 210                                if (ret)
 211                                        return ret;
 212                                /*
 213                                 * The cleaner kthread might still be doing iput
 214                                 * operations. Wait for it to finish so that
 215                                 * more space is released.  We don't need to
 216                                 * explicitly run the delayed iputs here because
 217                                 * the commit_transaction would have woken up
 218                                 * the cleaner.
 219                                 */
 220                                ret = btrfs_wait_on_delayed_iputs(fs_info);
 221                                if (ret)
 222                                        return ret;
 223                                goto again;
 224                        } else {
 225                                btrfs_end_transaction(trans);
 226                        }
 227                }
 228
 229                trace_btrfs_space_reservation(fs_info,
 230                                              "space_info:enospc",
 231                                              data_sinfo->flags, bytes, 1);
 232                return -ENOSPC;
 233        }
 234        btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
 235        spin_unlock(&data_sinfo->lock);
 236
 237        return 0;
 238}
 239
 240int btrfs_check_data_free_space(struct btrfs_inode *inode,
 241                        struct extent_changeset **reserved, u64 start, u64 len)
 242{
 243        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 244        int ret;
 245
 246        /* align the range */
 247        len = round_up(start + len, fs_info->sectorsize) -
 248              round_down(start, fs_info->sectorsize);
 249        start = round_down(start, fs_info->sectorsize);
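        /*
         * Worked example (editor's addition, hypothetical numbers): with a 4K
         * sectorsize, start = 5000 and len = 4000 cover bytes [5000, 9000).
         * round_down(5000, 4096) = 4096 and round_up(9000, 4096) = 12288, so
         * we reserve 12288 - 4096 = 8192 bytes starting at offset 4096, i.e.
         * every sector the original range touches.
         */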
 250
 251        ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 252        if (ret < 0)
 253                return ret;
 254
  255        /* Use btrfs_qgroup_reserve_data() to reserve precise data space. */
 256        ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
 257        if (ret < 0)
 258                btrfs_free_reserved_data_space_noquota(fs_info, len);
 259        else
 260                ret = 0;
 261        return ret;
 262}
 263
  264/*
  265 * Called if we need to clear a data reservation for this inode,
  266 * normally in an error case.
  267 *
  268 * This one will *NOT* use the accurate qgroup reserved space API; it is for
  269 * callers that can't sleep and are sure it won't affect qgroup reserved
  270 * space, like clear_bit_hook().
  271 */
 272void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
 273                                            u64 len)
 274{
 275        struct btrfs_space_info *data_sinfo;
 276
 277        ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
 278
 279        data_sinfo = fs_info->data_sinfo;
 280        spin_lock(&data_sinfo->lock);
 281        btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
 282        spin_unlock(&data_sinfo->lock);
 283}
 284
  285/*
  286 * Called if we need to clear a data reservation for this inode,
  287 * normally in an error case.
  288 *
  289 * This one will handle the per-inode data rsv map for the accurate reserved
  290 * space framework.
  291 */
 292void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
 293                        struct extent_changeset *reserved, u64 start, u64 len)
 294{
 295        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 296
 297        /* Make sure the range is aligned to sectorsize */
 298        len = round_up(start + len, fs_info->sectorsize) -
 299              round_down(start, fs_info->sectorsize);
 300        start = round_down(start, fs_info->sectorsize);
 301
 302        btrfs_free_reserved_data_space_noquota(fs_info, len);
 303        btrfs_qgroup_free_data(inode, reserved, start, len);
 304}
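
/*
 * Illustrative sketch (an editor's addition, not part of the original file):
 * a data-only reservation, as a prealloc-style caller might make, handed back
 * with btrfs_free_reserved_data_space() when the operation is abandoned.  The
 * helper name is made up.
 */
static int __maybe_unused example_data_only_reservation(struct btrfs_inode *inode,
                                                         u64 start, u64 len)
{
        struct extent_changeset *data_reserved = NULL;
        int ret;

        /* Reserve bytes_may_use and the matching qgroup data range. */
        ret = btrfs_check_data_free_space(inode, &data_reserved, start, len);
        if (ret < 0)
                return ret;

        /* Nothing was allocated, so undo both the space_info and qgroup rsv. */
        btrfs_free_reserved_data_space(inode, data_reserved, start, len);
        extent_changeset_free(data_reserved);
        return 0;
}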
 305
 306/**
 307 * btrfs_inode_rsv_release - release any excessive reservation.
  308 * @inode: the inode we need to release from.
  309 * @qgroup_free: free or convert qgroup meta.
 310 *   Unlike normal operation, qgroup meta reservation needs to know if we are
 311 *   freeing qgroup reservation or just converting it into per-trans.  Normally
 312 *   @qgroup_free is true for error handling, and false for normal release.
 313 *
 314 * This is the same as btrfs_block_rsv_release, except that it handles the
 315 * tracepoint for the reservation.
 316 */
 317static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 318{
 319        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 320        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 321        u64 released = 0;
 322        u64 qgroup_to_release = 0;
 323
 324        /*
 325         * Since we statically set the block_rsv->size we just want to say we
 326         * are releasing 0 bytes, and then we'll just get the reservation over
 327         * the size free'd.
 328         */
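        /*
         * Editor's example (hypothetical numbers): if ->size was just lowered
         * to 128K while ->reserved is still 192K, passing 0 here releases the
         * 64K excess.
         */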
 329        released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
 330                                           &qgroup_to_release);
 331        if (released > 0)
 332                trace_btrfs_space_reservation(fs_info, "delalloc",
 333                                              btrfs_ino(inode), released, 0);
 334        if (qgroup_free)
 335                btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
 336        else
 337                btrfs_qgroup_convert_reserved_meta(inode->root,
 338                                                   qgroup_to_release);
 339}
 340
 341static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 342                                                 struct btrfs_inode *inode)
 343{
 344        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 345        u64 reserve_size = 0;
 346        u64 qgroup_rsv_size = 0;
 347        u64 csum_leaves;
 348        unsigned outstanding_extents;
 349
 350        lockdep_assert_held(&inode->lock);
 351        outstanding_extents = inode->outstanding_extents;
 352
 353        /*
  354         * Insert size for the number of outstanding extents, plus one normal
  355         * metadata size for updating the inode.
 356         */
 357        if (outstanding_extents) {
 358                reserve_size = btrfs_calc_insert_metadata_size(fs_info,
 359                                                outstanding_extents);
 360                reserve_size += btrfs_calc_metadata_size(fs_info, 1);
 361        }
 362        csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
 363                                                 inode->csum_bytes);
 364        reserve_size += btrfs_calc_insert_metadata_size(fs_info,
 365                                                        csum_leaves);
 366        /*
 367         * For qgroup rsv, the calculation is very simple:
  368         * account one nodesize for each outstanding extent.
  369         *
  370         * This overestimates in most cases.
 371         */
 372        qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
 373
 374        spin_lock(&block_rsv->lock);
 375        block_rsv->size = reserve_size;
 376        block_rsv->qgroup_rsv_size = qgroup_rsv_size;
 377        spin_unlock(&block_rsv->lock);
 378}
 379
 380static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
 381                                    u64 num_bytes, u64 *meta_reserve,
 382                                    u64 *qgroup_reserve)
 383{
 384        u64 nr_extents = count_max_extents(num_bytes);
 385        u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
 386        u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
 387
 388        *meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
 389                                                nr_extents + csum_leaves);
 390
 391        /*
 392         * finish_ordered_io has to update the inode, so add the space required
 393         * for an inode update.
 394         */
 395        *meta_reserve += inode_update;
 396        *qgroup_reserve = nr_extents * fs_info->nodesize;
 397}
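
/*
 * Worked example (editor's addition, hypothetical numbers): for a 1 MiB write
 * with a 4K sectorsize, count_max_extents() returns 1 because 1 MiB is well
 * below the maximum extent size, and the checksums for 1 MiB of data normally
 * fit in a single leaf.  We therefore reserve insert space for two items (the
 * file extent item and the csum leaf) plus one more metadata item for the
 * inode update, while the qgroup side reserves a single nodesize.
 */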
 398
 399int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 400{
 401        struct btrfs_root *root = inode->root;
 402        struct btrfs_fs_info *fs_info = root->fs_info;
 403        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 404        u64 meta_reserve, qgroup_reserve;
 405        unsigned nr_extents;
 406        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 407        int ret = 0;
 408
 409        /*
  410         * If we are a free space inode we must not flush, since we will be in
 411         * the middle of a transaction commit.  We also don't need the delalloc
 412         * mutex since we won't race with anybody.  We need this mostly to make
 413         * lockdep shut its filthy mouth.
 414         *
 415         * If we have a transaction open (can happen if we call truncate_block
 416         * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
 417         */
 418        if (btrfs_is_free_space_inode(inode)) {
 419                flush = BTRFS_RESERVE_NO_FLUSH;
 420        } else {
 421                if (current->journal_info)
 422                        flush = BTRFS_RESERVE_FLUSH_LIMIT;
 423
 424                if (btrfs_transaction_in_commit(fs_info))
 425                        schedule_timeout(1);
 426        }
 427
 428        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 429
 430        /*
 431         * We always want to do it this way, every other way is wrong and ends
 432         * in tears.  Pre-reserving the amount we are going to add will always
 433         * be the right way, because otherwise if we have enough parallelism we
 434         * could end up with thousands of inodes all holding little bits of
 435         * reservations they were able to make previously and the only way to
 436         * reclaim that space is to ENOSPC out the operations and clear
 437         * everything out and try again, which is bad.  This way we just
 438         * over-reserve slightly, and clean up the mess when we are done.
 439         */
 440        calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
 441                                &qgroup_reserve);
 442        ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
 443        if (ret)
 444                return ret;
 445        ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
 446        if (ret) {
 447                btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
 448                return ret;
 449        }
 450
 451        /*
 452         * Now we need to update our outstanding extents and csum bytes _first_
 453         * and then add the reservation to the block_rsv.  This keeps us from
 454         * racing with an ordered completion or some such that would think it
 455         * needs to free the reservation we just made.
 456         */
 457        spin_lock(&inode->lock);
 458        nr_extents = count_max_extents(num_bytes);
 459        btrfs_mod_outstanding_extents(inode, nr_extents);
 460        inode->csum_bytes += num_bytes;
 461        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 462        spin_unlock(&inode->lock);
 463
 464        /* Now we can safely add our space to our block rsv */
 465        btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
 466        trace_btrfs_space_reservation(root->fs_info, "delalloc",
 467                                      btrfs_ino(inode), meta_reserve, 1);
 468
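        /* Track the qgroup reservation against the inode's block_rsv as well. */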
 469        spin_lock(&block_rsv->lock);
 470        block_rsv->qgroup_rsv_reserved += qgroup_reserve;
 471        spin_unlock(&block_rsv->lock);
 472
 473        return 0;
 474}
 475
 476/**
 477 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 478 * @inode: the inode to release the reservation for.
 479 * @num_bytes: the number of bytes we are releasing.
 480 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 481 *
 482 * This will release the metadata reservation for an inode.  This can be called
 483 * once we complete IO for a given set of bytes to release their metadata
 484 * reservations, or on error for the same reason.
 485 */
 486void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 487                                     bool qgroup_free)
 488{
 489        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 490
 491        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 492        spin_lock(&inode->lock);
 493        inode->csum_bytes -= num_bytes;
 494        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 495        spin_unlock(&inode->lock);
 496
 497        if (btrfs_is_testing(fs_info))
 498                return;
 499
 500        btrfs_inode_rsv_release(inode, qgroup_free);
 501}
 502
 503/**
 504 * btrfs_delalloc_release_extents - release our outstanding_extents
 505 * @inode: the inode to balance the reservation for.
  506 * @num_bytes: the number of bytes we originally reserved
 507 *
 508 * When we reserve space we increase outstanding_extents for the extents we may
 509 * add.  Once we've set the range as delalloc or created our ordered extents we
 510 * have outstanding_extents to track the real usage, so we use this to free our
 511 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
 512 * with btrfs_delalloc_reserve_metadata.
 513 */
 514void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
 515{
 516        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 517        unsigned num_extents;
 518
 519        spin_lock(&inode->lock);
 520        num_extents = count_max_extents(num_bytes);
 521        btrfs_mod_outstanding_extents(inode, -num_extents);
 522        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 523        spin_unlock(&inode->lock);
 524
 525        if (btrfs_is_testing(fs_info))
 526                return;
 527
 528        btrfs_inode_rsv_release(inode, true);
 529}
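
/*
 * Illustrative sketch (an editor's addition, not part of the original file):
 * a caller that reserved metadata but failed before any delalloc or ordered
 * extent was created gives the whole reservation back, freeing rather than
 * converting the qgroup portion because this is an error path.  The helper
 * name is made up.
 */
static void __maybe_unused example_metadata_error_path(struct btrfs_inode *inode,
                                                       u64 num_bytes)
{
        if (btrfs_delalloc_reserve_metadata(inode, num_bytes) < 0)
                return;

        /* ... the write was aborted before the range became delalloc ... */
        btrfs_delalloc_release_metadata(inode, num_bytes, true);
        btrfs_delalloc_release_extents(inode, num_bytes);
}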
 530
 531/**
 532 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 533 * delalloc
 534 * @inode: inode we're writing to
  535 * @start: start offset of the range we are writing to
  536 * @len: length of the range we are writing to
  537 * @reserved: mandatory parameter, records the qgroup ranges actually reserved
  538 *            by the current reservation.
  539 *
  540 * This will do the following things:
  541 *
  542 * - reserve space in the data space info for num bytes and reserve the
  543 *   precise corresponding qgroup space
  544 *   (done in btrfs_check_data_free_space)
  545 *
  546 * - reserve metadata space, based on the number of outstanding extents and
  547 *   how many csums will be needed; also reserve metadata space in a per-root
  548 *   over-reserve method.
  549 * - add to the inode's delalloc_bytes
  550 * - add it to the fs_info's delalloc inodes list.
  551 *   (The above three are all done in btrfs_delalloc_reserve_metadata)
  552 *
  553 * Return 0 for success
  554 * Return <0 for error (-ENOSPC or -EDQUOT)
 555 */
 556int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 557                        struct extent_changeset **reserved, u64 start, u64 len)
 558{
 559        int ret;
 560
 561        ret = btrfs_check_data_free_space(inode, reserved, start, len);
 562        if (ret < 0)
 563                return ret;
 564        ret = btrfs_delalloc_reserve_metadata(inode, len);
 565        if (ret < 0)
 566                btrfs_free_reserved_data_space(inode, *reserved, start, len);
 567        return ret;
 568}
 569
  570/**
  571 * btrfs_delalloc_release_space - release data and metadata space for delalloc
  572 * @inode: inode we're releasing space for
  573 * @reserved: the qgroup ranges recorded when the data space was reserved
  574 * @start: start position of the space already reserved
  575 * @len: length of the space already reserved
  576 * @qgroup_free: free the qgroup meta reservation or convert it to per-trans
  577 *
  578 * This function will release the metadata space that was not used, decrement
  579 * ->delalloc_bytes and remove the inode from the fs_info delalloc_inodes list
  580 * if there are no delalloc bytes left, and handle the qgroup reserved space.
  581 */
 582void btrfs_delalloc_release_space(struct btrfs_inode *inode,
 583                                  struct extent_changeset *reserved,
 584                                  u64 start, u64 len, bool qgroup_free)
 585{
 586        btrfs_delalloc_release_metadata(inode, len, qgroup_free);
 587        btrfs_free_reserved_data_space(inode, reserved, start, len);
 588}
 589