linux/fs/xfs/libxfs/xfs_ag_resv.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * Copyright (C) 2016 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_mount.h"
  13#include "xfs_alloc.h"
  14#include "xfs_errortag.h"
  15#include "xfs_error.h"
  16#include "xfs_trace.h"
  17#include "xfs_trans.h"
  18#include "xfs_rmap_btree.h"
  19#include "xfs_btree.h"
  20#include "xfs_refcount_btree.h"
  21#include "xfs_ialloc_btree.h"
  22#include "xfs_ag.h"
  23#include "xfs_ag_resv.h"
  24
  25/*
  26 * Per-AG Block Reservations
  27 *
  28 * For some kinds of allocation group metadata structures, it is advantageous
  29 * to reserve a small number of blocks in each AG so that future expansions of
  30 * that data structure do not encounter ENOSPC because errors during a btree
  31 * split cause the filesystem to go offline.
  32 *
  33 * Prior to the introduction of reflink, this wasn't an issue because the free
  34 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
  35 * that may be necessary; and allocations of other metadata (inodes, BMBT,
  36 * dir/attr) aren't restricted to a single AG.  However, with reflink it is
  37 * possible to allocate all the space in an AG, have subsequent reflink/CoW
  38 * activity expand the refcount btree, and discover that there's no space left
  39 * to handle that expansion.  Since we can calculate the maximum size of the
  40 * refcount btree, we can reserve space for it and avoid ENOSPC.
  41 *
  42 * Handling per-AG reservations consists of four changes to the allocator's
  43 * behavior:  First, because these reservations are always needed, we decrease
  44 * the ag_max_usable counter to reflect the size of the AG after the reserved
  45 * blocks are taken.  Second, the reservations must be reflected in the
  46 * fdblocks count to maintain proper accounting.  Third, each AG must maintain
  47 * its own reserved block counter so that we can calculate the amount of space
  48 * that must remain free to maintain the reservations.  Fourth, the "remaining
  49 * reserved blocks" count must be used when calculating the length of the
  50 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
  51 * functions.  In other words, we maintain a virtual allocation via in-core
  52 * accounting tricks so that we don't have to clean up after a crash. :)
  53 *
  54 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
  55 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
  56 * function.  It might seem a little funny to maintain a reservoir of blocks
  57 * to feed another reservoir, but the AGFL only holds enough blocks to get
  58 * through the next transaction.  The per-AG reservation is to ensure (we
  59 * hope) that each AG never runs out of blocks.  Each data structure wanting
  60 * to use the reservation system should update ask/used in xfs_ag_resv_init.
  61 */
  62
  63/*
  64 * Are we critically low on blocks?  For now we'll define that as the number
  65 * of blocks we can get our hands on being less than 10% of what we reserved
  66 * or less than some arbitrary number (maximum btree height).
  67 */
  68bool
  69xfs_ag_resv_critical(
  70        struct xfs_perag                *pag,
  71        enum xfs_ag_resv_type           type)
  72{
  73        xfs_extlen_t                    avail;
  74        xfs_extlen_t                    orig;
  75
  76        switch (type) {
  77        case XFS_AG_RESV_METADATA:
  78                avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
  79                orig = pag->pag_meta_resv.ar_asked;
  80                break;
  81        case XFS_AG_RESV_RMAPBT:
  82                avail = pag->pagf_freeblks + pag->pagf_flcount -
  83                        pag->pag_meta_resv.ar_reserved;
  84                orig = pag->pag_rmapbt_resv.ar_asked;
  85                break;
  86        default:
  87                ASSERT(0);
  88                return false;
  89        }
  90
  91        trace_xfs_ag_resv_critical(pag, type, avail);
  92
  93        /* Critically low if less than 10% or max btree height remains. */
  94        return XFS_TEST_ERROR(avail < orig / 10 ||
  95                              avail < pag->pag_mount->m_agbtree_maxlevels,
  96                        pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
  97}
  98
  99/*
 100 * How many blocks are reserved but not used, and therefore must not be
 101 * allocated away?
 102 */
 103xfs_extlen_t
 104xfs_ag_resv_needed(
 105        struct xfs_perag                *pag,
 106        enum xfs_ag_resv_type           type)
 107{
 108        xfs_extlen_t                    len;
 109
 110        len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
 111        switch (type) {
 112        case XFS_AG_RESV_METADATA:
 113        case XFS_AG_RESV_RMAPBT:
 114                len -= xfs_perag_resv(pag, type)->ar_reserved;
 115                break;
 116        case XFS_AG_RESV_NONE:
 117                /* empty */
 118                break;
 119        default:
 120                ASSERT(0);
 121        }
 122
 123        trace_xfs_ag_resv_needed(pag, type, len);
 124
 125        return len;
 126}
 127
 128/* Clean out a reservation */
 129static int
 130__xfs_ag_resv_free(
 131        struct xfs_perag                *pag,
 132        enum xfs_ag_resv_type           type)
 133{
 134        struct xfs_ag_resv              *resv;
 135        xfs_extlen_t                    oldresv;
 136        int                             error;
 137
 138        trace_xfs_ag_resv_free(pag, type, 0);
 139
 140        resv = xfs_perag_resv(pag, type);
 141        if (pag->pag_agno == 0)
 142                pag->pag_mount->m_ag_max_usable += resv->ar_asked;
 143        /*
 144         * RMAPBT blocks come from the AGFL and AGFL blocks are always
 145         * considered "free", so whatever was reserved at mount time must be
 146         * given back at umount.
 147         */
 148        if (type == XFS_AG_RESV_RMAPBT)
 149                oldresv = resv->ar_orig_reserved;
 150        else
 151                oldresv = resv->ar_reserved;
 152        error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
 153        resv->ar_reserved = 0;
 154        resv->ar_asked = 0;
 155        resv->ar_orig_reserved = 0;
 156
 157        if (error)
 158                trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
 159                                error, _RET_IP_);
 160        return error;
 161}
 162
 163/* Free a per-AG reservation. */
 164int
 165xfs_ag_resv_free(
 166        struct xfs_perag                *pag)
 167{
 168        int                             error;
 169        int                             err2;
 170
 171        error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
 172        err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 173        if (err2 && !error)
 174                error = err2;
 175        return error;
 176}
 177
 178static int
 179__xfs_ag_resv_init(
 180        struct xfs_perag                *pag,
 181        enum xfs_ag_resv_type           type,
 182        xfs_extlen_t                    ask,
 183        xfs_extlen_t                    used)
 184{
 185        struct xfs_mount                *mp = pag->pag_mount;
 186        struct xfs_ag_resv              *resv;
 187        int                             error;
 188        xfs_extlen_t                    hidden_space;
 189
 190        if (used > ask)
 191                ask = used;
 192
 193        switch (type) {
 194        case XFS_AG_RESV_RMAPBT:
 195                /*
 196                 * Space taken by the rmapbt is not subtracted from fdblocks
 197                 * because the rmapbt lives in the free space.  Here we must
 198                 * subtract the entire reservation from fdblocks so that we
 199                 * always have blocks available for rmapbt expansion.
 200                 */
 201                hidden_space = ask;
 202                break;
 203        case XFS_AG_RESV_METADATA:
 204                /*
 205                 * Space taken by all other metadata btrees are accounted
 206                 * on-disk as used space.  We therefore only hide the space
 207                 * that is reserved but not used by the trees.
 208                 */
 209                hidden_space = ask - used;
 210                break;
 211        default:
 212                ASSERT(0);
 213                return -EINVAL;
 214        }
 215
 216        if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
 217                error = -ENOSPC;
 218        else
 219                error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
 220        if (error) {
 221                trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 222                                error, _RET_IP_);
 223                xfs_warn(mp,
 224"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
 225                                pag->pag_agno);
 226                return error;
 227        }
 228
 229        /*
 230         * Reduce the maximum per-AG allocation length by however much we're
 231         * trying to reserve for an AG.  Since this is a filesystem-wide
 232         * counter, we only make the adjustment for AG 0.  This assumes that
 233         * there aren't any AGs hungrier for per-AG reservation than AG 0.
 234         */
 235        if (pag->pag_agno == 0)
 236                mp->m_ag_max_usable -= ask;
 237
 238        resv = xfs_perag_resv(pag, type);
 239        resv->ar_asked = ask;
 240        resv->ar_orig_reserved = hidden_space;
 241        resv->ar_reserved = ask - used;
 242
 243        trace_xfs_ag_resv_init(pag, type, ask);
 244        return 0;
 245}
 246
 247/* Create a per-AG block reservation. */
 248int
 249xfs_ag_resv_init(
 250        struct xfs_perag                *pag,
 251        struct xfs_trans                *tp)
 252{
 253        struct xfs_mount                *mp = pag->pag_mount;
 254        xfs_extlen_t                    ask;
 255        xfs_extlen_t                    used;
 256        int                             error = 0, error2;
 257        bool                            has_resv = false;
 258
 259        /* Create the metadata reservation. */
 260        if (pag->pag_meta_resv.ar_asked == 0) {
 261                ask = used = 0;
 262
 263                error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
 264                if (error)
 265                        goto out;
 266
 267                error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
 268                if (error)
 269                        goto out;
 270
 271                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 272                                ask, used);
 273                if (error) {
 274                        /*
 275                         * Because we didn't have per-AG reservations when the
 276                         * finobt feature was added we might not be able to
 277                         * reserve all needed blocks.  Warn and fall back to the
 278                         * old and potentially buggy code in that case, but
 279                         * ensure we do have the reservation for the refcountbt.
 280                         */
 281                        ask = used = 0;
 282
 283                        mp->m_finobt_nores = true;
 284
 285                        error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
 286                                        &used);
 287                        if (error)
 288                                goto out;
 289
 290                        error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 291                                        ask, used);
 292                        if (error)
 293                                goto out;
 294                }
 295                if (ask)
 296                        has_resv = true;
 297        }
 298
 299        /* Create the RMAPBT metadata reservation */
 300        if (pag->pag_rmapbt_resv.ar_asked == 0) {
 301                ask = used = 0;
 302
 303                error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
 304                if (error)
 305                        goto out;
 306
 307                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
 308                if (error)
 309                        goto out;
 310                if (ask)
 311                        has_resv = true;
 312        }
 313
 314out:
 315        /*
 316         * Initialize the pagf if we have at least one active reservation on the
 317         * AG. This may have occurred already via reservation calculation, but
 318         * fall back to an explicit init to ensure the in-core allocbt usage
 319         * counters are initialized as soon as possible. This is important
 320         * because filesystems with large perag reservations are susceptible to
 321         * free space reservation problems that the allocbt counter is used to
 322         * address.
 323         */
 324        if (has_resv) {
 325                error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
 326                if (error2)
 327                        return error2;
 328
 329                /*
 330                 * If there isn't enough space in the AG to satisfy the
 331                 * reservation, let the caller know that there wasn't enough
 332                 * space.  Callers are responsible for deciding what to do
 333                 * next, since (in theory) we can stumble along with
 334                 * insufficient reservation if data blocks are being freed to
 335                 * replenish the AG's free space.
 336                 */
 337                if (!error &&
 338                    xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 339                    xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
 340                    pag->pagf_freeblks + pag->pagf_flcount)
 341                        error = -ENOSPC;
 342        }
 343
 344        return error;
 345}
 346
 347/* Allocate a block from the reservation. */
 348void
 349xfs_ag_resv_alloc_extent(
 350        struct xfs_perag                *pag,
 351        enum xfs_ag_resv_type           type,
 352        struct xfs_alloc_arg            *args)
 353{
 354        struct xfs_ag_resv              *resv;
 355        xfs_extlen_t                    len;
 356        uint                            field;
 357
 358        trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
 359
 360        switch (type) {
 361        case XFS_AG_RESV_AGFL:
 362                return;
 363        case XFS_AG_RESV_METADATA:
 364        case XFS_AG_RESV_RMAPBT:
 365                resv = xfs_perag_resv(pag, type);
 366                break;
 367        default:
 368                ASSERT(0);
 369                fallthrough;
 370        case XFS_AG_RESV_NONE:
 371                field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
 372                                       XFS_TRANS_SB_FDBLOCKS;
 373                xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
 374                return;
 375        }
 376
 377        len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
 378        resv->ar_reserved -= len;
 379        if (type == XFS_AG_RESV_RMAPBT)
 380                return;
 381        /* Allocations of reserved blocks only need on-disk sb updates... */
 382        xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
 383        /* ...but non-reserved blocks need in-core and on-disk updates. */
 384        if (args->len > len)
 385                xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
 386                                -((int64_t)args->len - len));
 387}
 388
 389/* Free a block to the reservation. */
 390void
 391xfs_ag_resv_free_extent(
 392        struct xfs_perag                *pag,
 393        enum xfs_ag_resv_type           type,
 394        struct xfs_trans                *tp,
 395        xfs_extlen_t                    len)
 396{
 397        xfs_extlen_t                    leftover;
 398        struct xfs_ag_resv              *resv;
 399
 400        trace_xfs_ag_resv_free_extent(pag, type, len);
 401
 402        switch (type) {
 403        case XFS_AG_RESV_AGFL:
 404                return;
 405        case XFS_AG_RESV_METADATA:
 406        case XFS_AG_RESV_RMAPBT:
 407                resv = xfs_perag_resv(pag, type);
 408                break;
 409        default:
 410                ASSERT(0);
 411                fallthrough;
 412        case XFS_AG_RESV_NONE:
 413                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
 414                return;
 415        }
 416
 417        leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
 418        resv->ar_reserved += leftover;
 419        if (type == XFS_AG_RESV_RMAPBT)
 420                return;
 421        /* Freeing into the reserved pool only requires on-disk update... */
 422        xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
 423        /* ...but freeing beyond that requires in-core and on-disk update. */
 424        if (len > leftover)
 425                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
 426}
 427