linux/fs/xfs/libxfs/xfs_ag_resv.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * Copyright (C) 2016 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_mount.h"
  13#include "xfs_alloc.h"
  14#include "xfs_errortag.h"
  15#include "xfs_error.h"
  16#include "xfs_trace.h"
  17#include "xfs_trans.h"
  18#include "xfs_rmap_btree.h"
  19#include "xfs_btree.h"
  20#include "xfs_refcount_btree.h"
  21#include "xfs_ialloc_btree.h"
  22
  23/*
  24 * Per-AG Block Reservations
  25 *
  26 * For some kinds of allocation group metadata structures, it is advantageous
  27 * to reserve a small number of blocks in each AG so that future expansions of
  28 * that data structure do not encounter ENOSPC because errors during a btree
  29 * split cause the filesystem to go offline.
  30 *
  31 * Prior to the introduction of reflink, this wasn't an issue because the free
  32 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
  33 * that may be necessary; and allocations of other metadata (inodes, BMBT,
  34 * dir/attr) aren't restricted to a single AG.  However, with reflink it is
  35 * possible to allocate all the space in an AG, have subsequent reflink/CoW
  36 * activity expand the refcount btree, and discover that there's no space left
  37 * to handle that expansion.  Since we can calculate the maximum size of the
  38 * refcount btree, we can reserve space for it and avoid ENOSPC.
  39 *
   40 * Handling per-AG reservations consists of four changes to the allocator's
  41 * behavior:  First, because these reservations are always needed, we decrease
  42 * the ag_max_usable counter to reflect the size of the AG after the reserved
  43 * blocks are taken.  Second, the reservations must be reflected in the
  44 * fdblocks count to maintain proper accounting.  Third, each AG must maintain
  45 * its own reserved block counter so that we can calculate the amount of space
  46 * that must remain free to maintain the reservations.  Fourth, the "remaining
  47 * reserved blocks" count must be used when calculating the length of the
  48 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
  49 * functions.  In other words, we maintain a virtual allocation via in-core
  50 * accounting tricks so that we don't have to clean up after a crash. :)
  51 *
  52 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
  53 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
  54 * function.  It might seem a little funny to maintain a reservoir of blocks
  55 * to feed another reservoir, but the AGFL only holds enough blocks to get
  56 * through the next transaction.  The per-AG reservation is to ensure (we
  57 * hope) that each AG never runs out of blocks.  Each data structure wanting
  58 * to use the reservation system should update ask/used in xfs_ag_resv_init.
  59 */
  60
  61/*
  62 * Are we critically low on blocks?  For now we'll define that as the number
  63 * of blocks we can get our hands on being less than 10% of what we reserved
  64 * or less than some arbitrary number (maximum btree height).
  65 */
  66bool
  67xfs_ag_resv_critical(
  68        struct xfs_perag                *pag,
  69        enum xfs_ag_resv_type           type)
  70{
  71        xfs_extlen_t                    avail;
  72        xfs_extlen_t                    orig;
  73
  74        switch (type) {
  75        case XFS_AG_RESV_METADATA:
  76                avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
  77                orig = pag->pag_meta_resv.ar_asked;
  78                break;
  79        case XFS_AG_RESV_RMAPBT:
  80                avail = pag->pagf_freeblks + pag->pagf_flcount -
  81                        pag->pag_meta_resv.ar_reserved;
  82                orig = pag->pag_rmapbt_resv.ar_asked;
  83                break;
  84        default:
  85                ASSERT(0);
  86                return false;
  87        }
  88
  89        trace_xfs_ag_resv_critical(pag, type, avail);
  90
  91        /* Critically low if less than 10% or max btree height remains. */
  92        return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
  93                        pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
  94}
  95
  96/*
  97 * How many blocks are reserved but not used, and therefore must not be
  98 * allocated away?
  99 */
 100xfs_extlen_t
 101xfs_ag_resv_needed(
 102        struct xfs_perag                *pag,
 103        enum xfs_ag_resv_type           type)
 104{
 105        xfs_extlen_t                    len;
 106
 107        len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
 108        switch (type) {
 109        case XFS_AG_RESV_METADATA:
 110        case XFS_AG_RESV_RMAPBT:
 111                len -= xfs_perag_resv(pag, type)->ar_reserved;
 112                break;
 113        case XFS_AG_RESV_NONE:
 114                /* empty */
 115                break;
 116        default:
 117                ASSERT(0);
 118        }
 119
 120        trace_xfs_ag_resv_needed(pag, type, len);
 121
 122        return len;
 123}
 124
 125/* Clean out a reservation */
 126static int
 127__xfs_ag_resv_free(
 128        struct xfs_perag                *pag,
 129        enum xfs_ag_resv_type           type)
 130{
 131        struct xfs_ag_resv              *resv;
 132        xfs_extlen_t                    oldresv;
 133        int                             error;
 134
 135        trace_xfs_ag_resv_free(pag, type, 0);
 136
 137        resv = xfs_perag_resv(pag, type);
 138        if (pag->pag_agno == 0)
 139                pag->pag_mount->m_ag_max_usable += resv->ar_asked;
 140        /*
 141         * RMAPBT blocks come from the AGFL and AGFL blocks are always
 142         * considered "free", so whatever was reserved at mount time must be
 143         * given back at umount.
 144         */
 145        if (type == XFS_AG_RESV_RMAPBT)
 146                oldresv = resv->ar_orig_reserved;
 147        else
 148                oldresv = resv->ar_reserved;
 149        error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
 150        resv->ar_reserved = 0;
 151        resv->ar_asked = 0;
 152        resv->ar_orig_reserved = 0;
 153
 154        if (error)
 155                trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
 156                                error, _RET_IP_);
 157        return error;
 158}
 159
 160/* Free a per-AG reservation. */
 161int
 162xfs_ag_resv_free(
 163        struct xfs_perag                *pag)
 164{
 165        int                             error;
 166        int                             err2;
 167
 168        error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
 169        err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 170        if (err2 && !error)
 171                error = err2;
 172        return error;
 173}
 174
 175static int
 176__xfs_ag_resv_init(
 177        struct xfs_perag                *pag,
 178        enum xfs_ag_resv_type           type,
 179        xfs_extlen_t                    ask,
 180        xfs_extlen_t                    used)
 181{
 182        struct xfs_mount                *mp = pag->pag_mount;
 183        struct xfs_ag_resv              *resv;
 184        int                             error;
 185        xfs_extlen_t                    hidden_space;
 186
 187        if (used > ask)
 188                ask = used;
 189
 190        switch (type) {
 191        case XFS_AG_RESV_RMAPBT:
 192                /*
 193                 * Space taken by the rmapbt is not subtracted from fdblocks
 194                 * because the rmapbt lives in the free space.  Here we must
 195                 * subtract the entire reservation from fdblocks so that we
 196                 * always have blocks available for rmapbt expansion.
 197                 */
 198                hidden_space = ask;
 199                break;
 200        case XFS_AG_RESV_METADATA:
 201                /*
 202                 * Space taken by all other metadata btrees are accounted
 203                 * on-disk as used space.  We therefore only hide the space
 204                 * that is reserved but not used by the trees.
 205                 */
 206                hidden_space = ask - used;
 207                break;
 208        default:
 209                ASSERT(0);
 210                return -EINVAL;
 211        }
 212        error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
 213        if (error) {
 214                trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 215                                error, _RET_IP_);
 216                xfs_warn(mp,
 217"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
 218                                pag->pag_agno);
 219                return error;
 220        }
 221
 222        /*
 223         * Reduce the maximum per-AG allocation length by however much we're
 224         * trying to reserve for an AG.  Since this is a filesystem-wide
 225         * counter, we only make the adjustment for AG 0.  This assumes that
 226         * there aren't any AGs hungrier for per-AG reservation than AG 0.
 227         */
 228        if (pag->pag_agno == 0)
 229                mp->m_ag_max_usable -= ask;
 230
 231        resv = xfs_perag_resv(pag, type);
 232        resv->ar_asked = ask;
 233        resv->ar_orig_reserved = hidden_space;
 234        resv->ar_reserved = ask - used;
 235
 236        trace_xfs_ag_resv_init(pag, type, ask);
 237        return 0;
 238}
 239
 240/* Create a per-AG block reservation. */
 241int
 242xfs_ag_resv_init(
 243        struct xfs_perag                *pag,
 244        struct xfs_trans                *tp)
 245{
 246        struct xfs_mount                *mp = pag->pag_mount;
 247        xfs_agnumber_t                  agno = pag->pag_agno;
 248        xfs_extlen_t                    ask;
 249        xfs_extlen_t                    used;
 250        int                             error = 0;
 251
 252        /* Create the metadata reservation. */
 253        if (pag->pag_meta_resv.ar_asked == 0) {
 254                ask = used = 0;
 255
 256                error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used);
 257                if (error)
 258                        goto out;
 259
 260                error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used);
 261                if (error)
 262                        goto out;
 263
 264                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 265                                ask, used);
 266                if (error) {
 267                        /*
 268                         * Because we didn't have per-AG reservations when the
 269                         * finobt feature was added we might not be able to
 270                         * reserve all needed blocks.  Warn and fall back to the
 271                         * old and potentially buggy code in that case, but
 272                         * ensure we do have the reservation for the refcountbt.
 273                         */
 274                        ask = used = 0;
 275
 276                        mp->m_finobt_nores = true;
 277
 278                        error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
 279                                        &used);
 280                        if (error)
 281                                goto out;
 282
 283                        error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 284                                        ask, used);
 285                        if (error)
 286                                goto out;
 287                }
 288        }
 289
 290        /* Create the RMAPBT metadata reservation */
 291        if (pag->pag_rmapbt_resv.ar_asked == 0) {
 292                ask = used = 0;
 293
 294                error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used);
 295                if (error)
 296                        goto out;
 297
 298                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
 299                if (error)
 300                        goto out;
 301        }
 302
 303#ifdef DEBUG
 304        /* need to read in the AGF for the ASSERT below to work */
 305        error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0);
 306        if (error)
 307                return error;
 308
 309        ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 310               xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
 311               pag->pagf_freeblks + pag->pagf_flcount);
 312#endif
 313out:
 314        return error;
 315}
 316
 317/* Allocate a block from the reservation. */
 318void
 319xfs_ag_resv_alloc_extent(
 320        struct xfs_perag                *pag,
 321        enum xfs_ag_resv_type           type,
 322        struct xfs_alloc_arg            *args)
 323{
 324        struct xfs_ag_resv              *resv;
 325        xfs_extlen_t                    len;
 326        uint                            field;
 327
 328        trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
 329
 330        switch (type) {
 331        case XFS_AG_RESV_AGFL:
 332                return;
 333        case XFS_AG_RESV_METADATA:
 334        case XFS_AG_RESV_RMAPBT:
 335                resv = xfs_perag_resv(pag, type);
 336                break;
 337        default:
 338                ASSERT(0);
 339                /* fall through */
 340        case XFS_AG_RESV_NONE:
 341                field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
 342                                       XFS_TRANS_SB_FDBLOCKS;
 343                xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
 344                return;
 345        }
 346
 347        len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
 348        resv->ar_reserved -= len;
 349        if (type == XFS_AG_RESV_RMAPBT)
 350                return;
 351        /* Allocations of reserved blocks only need on-disk sb updates... */
 352        xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
 353        /* ...but non-reserved blocks need in-core and on-disk updates. */
 354        if (args->len > len)
 355                xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
 356                                -((int64_t)args->len - len));
 357}
 358
 359/* Free a block to the reservation. */
 360void
 361xfs_ag_resv_free_extent(
 362        struct xfs_perag                *pag,
 363        enum xfs_ag_resv_type           type,
 364        struct xfs_trans                *tp,
 365        xfs_extlen_t                    len)
 366{
 367        xfs_extlen_t                    leftover;
 368        struct xfs_ag_resv              *resv;
 369
 370        trace_xfs_ag_resv_free_extent(pag, type, len);
 371
 372        switch (type) {
 373        case XFS_AG_RESV_AGFL:
 374                return;
 375        case XFS_AG_RESV_METADATA:
 376        case XFS_AG_RESV_RMAPBT:
 377                resv = xfs_perag_resv(pag, type);
 378                break;
 379        default:
 380                ASSERT(0);
 381                /* fall through */
 382        case XFS_AG_RESV_NONE:
 383                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
 384                return;
 385        }
 386
 387        leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
 388        resv->ar_reserved += leftover;
 389        if (type == XFS_AG_RESV_RMAPBT)
 390                return;
 391        /* Freeing into the reserved pool only requires on-disk update... */
 392        xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
 393        /* ...but freeing beyond that requires in-core and on-disk update. */
 394        if (len > leftover)
 395                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
 396}
 397