linux/fs/xfs/libxfs/xfs_ag_resv.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * Copyright (C) 2016 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_mount.h"
  13#include "xfs_alloc.h"
  14#include "xfs_errortag.h"
  15#include "xfs_error.h"
  16#include "xfs_trace.h"
  17#include "xfs_trans.h"
  18#include "xfs_rmap_btree.h"
  19#include "xfs_btree.h"
  20#include "xfs_refcount_btree.h"
  21#include "xfs_ialloc_btree.h"
  22#include "xfs_ag.h"
  23#include "xfs_ag_resv.h"
  24
  25/*
  26 * Per-AG Block Reservations
  27 *
  28 * For some kinds of allocation group metadata structures, it is advantageous
  29 * to reserve a small number of blocks in each AG so that future expansions of
  30 * that data structure do not encounter ENOSPC because errors during a btree
  31 * split cause the filesystem to go offline.
  32 *
  33 * Prior to the introduction of reflink, this wasn't an issue because the free
  34 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
  35 * that may be necessary; and allocations of other metadata (inodes, BMBT,
  36 * dir/attr) aren't restricted to a single AG.  However, with reflink it is
  37 * possible to allocate all the space in an AG, have subsequent reflink/CoW
  38 * activity expand the refcount btree, and discover that there's no space left
  39 * to handle that expansion.  Since we can calculate the maximum size of the
  40 * refcount btree, we can reserve space for it and avoid ENOSPC.
  41 *
   42 * Handling per-AG reservations consists of four changes to the allocator's
  43 * behavior:  First, because these reservations are always needed, we decrease
  44 * the ag_max_usable counter to reflect the size of the AG after the reserved
  45 * blocks are taken.  Second, the reservations must be reflected in the
  46 * fdblocks count to maintain proper accounting.  Third, each AG must maintain
  47 * its own reserved block counter so that we can calculate the amount of space
  48 * that must remain free to maintain the reservations.  Fourth, the "remaining
  49 * reserved blocks" count must be used when calculating the length of the
  50 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
  51 * functions.  In other words, we maintain a virtual allocation via in-core
  52 * accounting tricks so that we don't have to clean up after a crash. :)
  53 *
  54 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
  55 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
  56 * function.  It might seem a little funny to maintain a reservoir of blocks
  57 * to feed another reservoir, but the AGFL only holds enough blocks to get
  58 * through the next transaction.  The per-AG reservation is to ensure (we
  59 * hope) that each AG never runs out of blocks.  Each data structure wanting
  60 * to use the reservation system should update ask/used in xfs_ag_resv_init.
  61 */
  62
  63/*
  64 * Are we critically low on blocks?  For now we'll define that as the number
  65 * of blocks we can get our hands on being less than 10% of what we reserved
  66 * or less than some arbitrary number (maximum btree height).
  67 */
  68bool
  69xfs_ag_resv_critical(
  70        struct xfs_perag                *pag,
  71        enum xfs_ag_resv_type           type)
  72{
  73        xfs_extlen_t                    avail;
  74        xfs_extlen_t                    orig;
  75
  76        switch (type) {
  77        case XFS_AG_RESV_METADATA:
  78                avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
  79                orig = pag->pag_meta_resv.ar_asked;
  80                break;
  81        case XFS_AG_RESV_RMAPBT:
  82                avail = pag->pagf_freeblks + pag->pagf_flcount -
  83                        pag->pag_meta_resv.ar_reserved;
  84                orig = pag->pag_rmapbt_resv.ar_asked;
  85                break;
  86        default:
  87                ASSERT(0);
  88                return false;
  89        }
  90
  91        trace_xfs_ag_resv_critical(pag, type, avail);
  92
  93        /* Critically low if less than 10% or max btree height remains. */
  94        return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
  95                        pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
  96}
  97
  98/*
  99 * How many blocks are reserved but not used, and therefore must not be
 100 * allocated away?
 101 */
 102xfs_extlen_t
 103xfs_ag_resv_needed(
 104        struct xfs_perag                *pag,
 105        enum xfs_ag_resv_type           type)
 106{
 107        xfs_extlen_t                    len;
 108
 109        len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
 110        switch (type) {
 111        case XFS_AG_RESV_METADATA:
 112        case XFS_AG_RESV_RMAPBT:
 113                len -= xfs_perag_resv(pag, type)->ar_reserved;
 114                break;
 115        case XFS_AG_RESV_NONE:
 116                /* empty */
 117                break;
 118        default:
 119                ASSERT(0);
 120        }
 121
 122        trace_xfs_ag_resv_needed(pag, type, len);
 123
 124        return len;
 125}
 126
 127/* Clean out a reservation */
 128static int
 129__xfs_ag_resv_free(
 130        struct xfs_perag                *pag,
 131        enum xfs_ag_resv_type           type)
 132{
 133        struct xfs_ag_resv              *resv;
 134        xfs_extlen_t                    oldresv;
 135        int                             error;
 136
 137        trace_xfs_ag_resv_free(pag, type, 0);
 138
 139        resv = xfs_perag_resv(pag, type);
 140        if (pag->pag_agno == 0)
 141                pag->pag_mount->m_ag_max_usable += resv->ar_asked;
 142        /*
 143         * RMAPBT blocks come from the AGFL and AGFL blocks are always
 144         * considered "free", so whatever was reserved at mount time must be
 145         * given back at umount.
 146         */
 147        if (type == XFS_AG_RESV_RMAPBT)
 148                oldresv = resv->ar_orig_reserved;
 149        else
 150                oldresv = resv->ar_reserved;
 151        error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
 152        resv->ar_reserved = 0;
 153        resv->ar_asked = 0;
 154        resv->ar_orig_reserved = 0;
 155
 156        if (error)
 157                trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
 158                                error, _RET_IP_);
 159        return error;
 160}
 161
 162/* Free a per-AG reservation. */
 163int
 164xfs_ag_resv_free(
 165        struct xfs_perag                *pag)
 166{
 167        int                             error;
 168        int                             err2;
 169
 170        error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
 171        err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
 172        if (err2 && !error)
 173                error = err2;
 174        return error;
 175}
 176
 177static int
 178__xfs_ag_resv_init(
 179        struct xfs_perag                *pag,
 180        enum xfs_ag_resv_type           type,
 181        xfs_extlen_t                    ask,
 182        xfs_extlen_t                    used)
 183{
 184        struct xfs_mount                *mp = pag->pag_mount;
 185        struct xfs_ag_resv              *resv;
 186        int                             error;
 187        xfs_extlen_t                    hidden_space;
 188
 189        if (used > ask)
 190                ask = used;
 191
 192        switch (type) {
 193        case XFS_AG_RESV_RMAPBT:
 194                /*
 195                 * Space taken by the rmapbt is not subtracted from fdblocks
 196                 * because the rmapbt lives in the free space.  Here we must
 197                 * subtract the entire reservation from fdblocks so that we
 198                 * always have blocks available for rmapbt expansion.
 199                 */
 200                hidden_space = ask;
 201                break;
 202        case XFS_AG_RESV_METADATA:
 203                /*
 204                 * Space taken by all other metadata btrees are accounted
 205                 * on-disk as used space.  We therefore only hide the space
 206                 * that is reserved but not used by the trees.
 207                 */
 208                hidden_space = ask - used;
 209                break;
 210        default:
 211                ASSERT(0);
 212                return -EINVAL;
 213        }
 214
 215        if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
 216                error = -ENOSPC;
 217        else
 218                error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
 219        if (error) {
 220                trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 221                                error, _RET_IP_);
 222                xfs_warn(mp,
 223"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
 224                                pag->pag_agno);
 225                return error;
 226        }
 227
 228        /*
 229         * Reduce the maximum per-AG allocation length by however much we're
 230         * trying to reserve for an AG.  Since this is a filesystem-wide
 231         * counter, we only make the adjustment for AG 0.  This assumes that
 232         * there aren't any AGs hungrier for per-AG reservation than AG 0.
 233         */
 234        if (pag->pag_agno == 0)
 235                mp->m_ag_max_usable -= ask;
 236
 237        resv = xfs_perag_resv(pag, type);
 238        resv->ar_asked = ask;
 239        resv->ar_orig_reserved = hidden_space;
 240        resv->ar_reserved = ask - used;
 241
 242        trace_xfs_ag_resv_init(pag, type, ask);
 243        return 0;
 244}
 245
 246/* Create a per-AG block reservation. */
 247int
 248xfs_ag_resv_init(
 249        struct xfs_perag                *pag,
 250        struct xfs_trans                *tp)
 251{
 252        struct xfs_mount                *mp = pag->pag_mount;
 253        xfs_extlen_t                    ask;
 254        xfs_extlen_t                    used;
 255        int                             error = 0, error2;
 256        bool                            has_resv = false;
 257
 258        /* Create the metadata reservation. */
 259        if (pag->pag_meta_resv.ar_asked == 0) {
 260                ask = used = 0;
 261
 262                error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
 263                if (error)
 264                        goto out;
 265
 266                error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
 267                if (error)
 268                        goto out;
 269
 270                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 271                                ask, used);
 272                if (error) {
 273                        /*
 274                         * Because we didn't have per-AG reservations when the
 275                         * finobt feature was added we might not be able to
 276                         * reserve all needed blocks.  Warn and fall back to the
 277                         * old and potentially buggy code in that case, but
 278                         * ensure we do have the reservation for the refcountbt.
 279                         */
 280                        ask = used = 0;
 281
 282                        mp->m_finobt_nores = true;
 283
 284                        error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
 285                                        &used);
 286                        if (error)
 287                                goto out;
 288
 289                        error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 290                                        ask, used);
 291                        if (error)
 292                                goto out;
 293                }
 294                if (ask)
 295                        has_resv = true;
 296        }
 297
 298        /* Create the RMAPBT metadata reservation */
 299        if (pag->pag_rmapbt_resv.ar_asked == 0) {
 300                ask = used = 0;
 301
 302                error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
 303                if (error)
 304                        goto out;
 305
 306                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
 307                if (error)
 308                        goto out;
 309                if (ask)
 310                        has_resv = true;
 311        }
 312
 313out:
 314        /*
 315         * Initialize the pagf if we have at least one active reservation on the
 316         * AG. This may have occurred already via reservation calculation, but
 317         * fall back to an explicit init to ensure the in-core allocbt usage
 318         * counters are initialized as soon as possible. This is important
 319         * because filesystems with large perag reservations are susceptible to
 320         * free space reservation problems that the allocbt counter is used to
 321         * address.
 322         */
 323        if (has_resv) {
 324                error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
 325                if (error2)
 326                        return error2;
 327
 328                /*
 329                 * If there isn't enough space in the AG to satisfy the
 330                 * reservation, let the caller know that there wasn't enough
 331                 * space.  Callers are responsible for deciding what to do
 332                 * next, since (in theory) we can stumble along with
 333                 * insufficient reservation if data blocks are being freed to
 334                 * replenish the AG's free space.
 335                 */
 336                if (!error &&
 337                    xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
 338                    xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
 339                    pag->pagf_freeblks + pag->pagf_flcount)
 340                        error = -ENOSPC;
 341        }
 342
 343        return error;
 344}
 345
 346/* Allocate a block from the reservation. */
 347void
 348xfs_ag_resv_alloc_extent(
 349        struct xfs_perag                *pag,
 350        enum xfs_ag_resv_type           type,
 351        struct xfs_alloc_arg            *args)
 352{
 353        struct xfs_ag_resv              *resv;
 354        xfs_extlen_t                    len;
 355        uint                            field;
 356
 357        trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
 358
 359        switch (type) {
 360        case XFS_AG_RESV_AGFL:
 361                return;
 362        case XFS_AG_RESV_METADATA:
 363        case XFS_AG_RESV_RMAPBT:
 364                resv = xfs_perag_resv(pag, type);
 365                break;
 366        default:
 367                ASSERT(0);
 368                fallthrough;
 369        case XFS_AG_RESV_NONE:
 370                field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
 371                                       XFS_TRANS_SB_FDBLOCKS;
 372                xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
 373                return;
 374        }
 375
 376        len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
 377        resv->ar_reserved -= len;
 378        if (type == XFS_AG_RESV_RMAPBT)
 379                return;
 380        /* Allocations of reserved blocks only need on-disk sb updates... */
 381        xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
 382        /* ...but non-reserved blocks need in-core and on-disk updates. */
 383        if (args->len > len)
 384                xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
 385                                -((int64_t)args->len - len));
 386}
 387
 388/* Free a block to the reservation. */
 389void
 390xfs_ag_resv_free_extent(
 391        struct xfs_perag                *pag,
 392        enum xfs_ag_resv_type           type,
 393        struct xfs_trans                *tp,
 394        xfs_extlen_t                    len)
 395{
 396        xfs_extlen_t                    leftover;
 397        struct xfs_ag_resv              *resv;
 398
 399        trace_xfs_ag_resv_free_extent(pag, type, len);
 400
 401        switch (type) {
 402        case XFS_AG_RESV_AGFL:
 403                return;
 404        case XFS_AG_RESV_METADATA:
 405        case XFS_AG_RESV_RMAPBT:
 406                resv = xfs_perag_resv(pag, type);
 407                break;
 408        default:
 409                ASSERT(0);
 410                fallthrough;
 411        case XFS_AG_RESV_NONE:
 412                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
 413                return;
 414        }
 415
 416        leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
 417        resv->ar_reserved += leftover;
 418        if (type == XFS_AG_RESV_RMAPBT)
 419                return;
 420        /* Freeing into the reserved pool only requires on-disk update... */
 421        xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
 422        /* ...but freeing beyond that requires in-core and on-disk update. */
 423        if (len > leftover)
 424                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
 425}
 426