linux/fs/xfs/libxfs/xfs_defer.c
// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"

/*
 * Deferred Operations in XFS
 *
 * Due to the way locking rules work in XFS, certain transactions (block
 * mapping and unmapping, typically) have permanent reservations so that
 * we can roll the transaction to adhere to AG locking order rules and
 * to unlock buffers between metadata updates.  Prior to rmap/reflink,
 * the mapping code had a mechanism to perform these deferrals for
 * extents that were going to be freed; this code makes that facility
 * more generic.
 *
 * When adding the reverse mapping and reflink features, it became
 * necessary to perform complex remapping operations across multiple
 * transactions to comply with AG locking order rules, and to be able
 * to spread a single refcount update operation (an operation on an
 * n-block extent can update as many as n records!) among multiple
 * transactions.  XFS can roll a transaction to facilitate this, but
 * using this facility requires us to log "intent" items in case log
 * recovery needs to redo the operation, and to log "done" items to
 * indicate that redo is not necessary.
 *
 * Deferred work is tracked in xfs_defer_pending items.  Each pending
 * item tracks one type of deferred work.  Incoming work items (which
 * have not yet had an intent logged) are attached to a pending item
 * on the dop_intake list, where they wait for the caller to finish
 * the deferred operations.
 *
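 * A rough caller-side sketch (illustrative only; the write reservation
 * and the hypothetical extent-free work item "free_item" are
 * assumptions, not requirements of this interface):
 *
 *        struct xfs_trans        *tp;
 *        int                     error;
 *
 *        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
 *        if (error)
 *                return error;
 *        ...make some metadata updates and queue follow-up work...
 *        xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &free_item->xefi_list);
 *        error = xfs_defer_finish(&tp);
 *        if (error)
 *                goto out_cancel;
 *        return xfs_trans_commit(tp);
 *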
 * Finishing a set of deferred operations is an involved process.  To
 * start, we define "rolling a deferred-op transaction" as follows:
 *
 * > For each xfs_defer_pending item on the dop_intake list,
 *   - Sort the work items in AG order.  XFS locking
 *     order rules require us to lock buffers in AG order.
 *   - Create a log intent item for that type.
 *   - Attach it to the pending item.
 *   - Move the pending item from the dop_intake list to the
 *     dop_pending list.
 * > Roll the transaction.
 *
 * NOTE: To avoid exceeding the transaction reservation, we limit the
 * number of items that we attach to a given xfs_defer_pending.
 *
 * The actual finishing process looks like this:
 *
 * > For each xfs_defer_pending in the dop_pending list,
 *   - Roll the deferred-op transaction as above.
 *   - Create a log done item for that type, and attach it to the
 *     log intent item.
 *   - For each work item attached to the log intent item,
 *     * Perform the described action.
 *     * Attach the work item to the log done item.
 *     * If the result of doing the work was -EAGAIN, ->finish_item
 *       wants a new transaction.  See the "Requesting a Fresh
 *       Transaction while Finishing Deferred Work" section below for
 *       details.
 *
 * The key here is that we must log an intent item for all pending
 * work items every time we roll the transaction, and that we must log
 * a done item as soon as the work is completed.  With this mechanism
 * we can perform complex remapping operations, chaining intent items
 * as needed.
 *
 * Requesting a Fresh Transaction while Finishing Deferred Work
 *
 * If ->finish_item decides that it needs a fresh transaction to
 * finish the work, it must ask its caller (xfs_defer_finish) for a
 * continuation.  The most likely cause of this circumstance is the
 * refcount adjust functions deciding that they've logged enough items
 * to be at risk of exceeding the transaction reservation.
 *
 * To get a fresh transaction, we want to log the existing log done
 * item to prevent the log intent item from replaying, immediately log
 * a new log intent item with the unfinished work items, roll the
 * transaction, and re-call ->finish_item wherever it left off.  The
 * log done item and the new log intent item must be in the same
 * transaction or atomicity cannot be guaranteed; defer_finish ensures
 * that this happens.
 *
 * This requires some coordination between ->finish_item and
 * defer_finish.  Upon deciding to request a new transaction,
 * ->finish_item should update the current work item to reflect the
 * unfinished work.  Next, it should reset the log done item's list
 * count to the number of items finished, and return -EAGAIN.
 * defer_finish sees the -EAGAIN, logs the new log intent item
 * with the remaining work items, and leaves the xfs_defer_pending
 * item at the head of the dop_work queue.  Then it rolls the
 * transaction and picks up processing where it left off.
 * ->finish_item must take care to leave enough transaction
 * reservation to fit the new log intent item.
 *
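 * In outline, a ->finish_item implementation honoring this protocol
 * might be shaped as follows (a sketch only, matching how finish_item
 * is invoked below; the function name and every lowercase helper here
 * are hypothetical):
 *
 *        int
 *        xfs_example_finish_item(
 *                struct xfs_trans        *tp,
 *                struct list_head        *item,
 *                void                    *done_item,
 *                void                    **state)
 *        {
 *                int                     error = 0;
 *
 *                while (more_work(item) && !error)
 *                        error = do_one_unit(tp, item, state);
 *                if (error == -EAGAIN) {
 *                        trim_to_unfinished(item);
 *                        set_done_count(done_item, units_done(item));
 *                }
 *                return error;
 *        }
 *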
 * This is an example of remapping the extent (E, E+B) into file X at
 * offset A and dealing with the extent (C, C+B) already being mapped
 * there:
 * +-------------------------------------------------+
 * | Unmap file X startblock C offset A length B     | t0
 * | Intent to reduce refcount for extent (C, B)     |
 * | Intent to remove rmap (X, C, A, B)              |
 * | Intent to free extent (D, 1) (bmbt block)       |
 * | Intent to map (X, A, B) at startblock E         |
 * +-------------------------------------------------+
 * | Map file X startblock E offset A length B       | t1
 * | Done mapping (X, E, A, B)                       |
 * | Intent to increase refcount for extent (E, B)   |
 * | Intent to add rmap (X, E, A, B)                 |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C, B)               | t2
 * | Done reducing refcount for extent (C, 9)        |
 * | Intent to reduce refcount for extent (C+9, B-9) |
 * | (ran out of space after 9 refcount updates)     |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C+9, B-9)           | t3
 * | Done reducing refcount for extent (C+9, B-9)    |
 * | Increase refcount for extent (E, B)             |
 * | Done increasing refcount for extent (E, B)      |
 * | Intent to free extent (C, B)                    |
 * | Intent to free extent (F, 1) (refcountbt block) |
 * | Intent to remove rmap (F, 1, REFC)              |
 * +-------------------------------------------------+
 * | Remove rmap (X, C, A, B)                        | t4
 * | Done removing rmap (X, C, A, B)                 |
 * | Add rmap (X, E, A, B)                           |
 * | Done adding rmap (X, E, A, B)                   |
 * | Remove rmap (F, 1, REFC)                        |
 * | Done removing rmap (F, 1, REFC)                 |
 * +-------------------------------------------------+
 * | Free extent (C, B)                              | t5
 * | Done freeing extent (C, B)                      |
 * | Free extent (D, 1)                              |
 * | Done freeing extent (D, 1)                      |
 * | Free extent (F, 1)                              |
 * | Done freeing extent (F, 1)                      |
 * +-------------------------------------------------+
 *
 * If we should crash before t2 commits, log recovery replays
 * the following intent items:
 *
 * - Intent to reduce refcount for extent (C, B)
 * - Intent to remove rmap (X, C, A, B)
 * - Intent to free extent (D, 1) (bmbt block)
 * - Intent to increase refcount for extent (E, B)
 * - Intent to add rmap (X, E, A, B)
 *
 * In the process of recovering, it should also generate and take care
 * of these intent items:
 *
 * - Intent to free extent (C, B)
 * - Intent to free extent (F, 1) (refcountbt block)
 * - Intent to remove rmap (F, 1, REFC)
 *
 * Note that the continuation requested between t2 and t3 is likely to
 * recur.
 */

static const struct xfs_defer_op_type *defer_op_types[] = {
        [XFS_DEFER_OPS_TYPE_BMAP]       = &xfs_bmap_update_defer_type,
        [XFS_DEFER_OPS_TYPE_REFCOUNT]   = &xfs_refcount_update_defer_type,
        [XFS_DEFER_OPS_TYPE_RMAP]       = &xfs_rmap_update_defer_type,
        [XFS_DEFER_OPS_TYPE_FREE]       = &xfs_extent_free_defer_type,
        [XFS_DEFER_OPS_TYPE_AGFL_FREE]  = &xfs_agfl_free_defer_type,
};
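
/*
 * Each entry above supplies the callbacks used below to process one type
 * of deferred work: create_intent/log_item to log an intent, diff_items
 * to sort work items in AG order, create_done and finish_item to perform
 * the work and log completion, abort_intent and cancel_item to back out,
 * plus an optional finish_cleanup hook and max_items batching limit.
 */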

/*
 * For each pending item in the intake list, log its intent item and the
 * associated extents; the caller then splices the entire intake list
 * onto the end of the pending list.
 */
STATIC void
xfs_defer_create_intents(
        struct xfs_trans                *tp)
{
        struct list_head                *li;
        struct xfs_defer_pending        *dfp;
        const struct xfs_defer_op_type  *ops;

        list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
                ops = defer_op_types[dfp->dfp_type];
                dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
                trace_xfs_defer_create_intent(tp->t_mountp, dfp);
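                /* Sort the work items in AG order to obey locking rules. */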
                list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
                list_for_each(li, &dfp->dfp_work)
                        ops->log_item(tp, dfp->dfp_intent, li);
        }
}

/* Abort all the intents that were committed. */
STATIC void
xfs_defer_trans_abort(
        struct xfs_trans                *tp,
        struct list_head                *dop_pending)
{
        struct xfs_defer_pending        *dfp;
        const struct xfs_defer_op_type  *ops;

        trace_xfs_defer_trans_abort(tp, _RET_IP_);

        /* Abort intent items that don't have a done item. */
        list_for_each_entry(dfp, dop_pending, dfp_list) {
                ops = defer_op_types[dfp->dfp_type];
                trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
                if (dfp->dfp_intent && !dfp->dfp_done) {
                        ops->abort_intent(dfp->dfp_intent);
                        dfp->dfp_intent = NULL;
                }
        }
}

/* Roll a transaction so we can do some deferred op processing. */
STATIC int
xfs_defer_trans_roll(
        struct xfs_trans                **tpp)
{
        struct xfs_trans                *tp = *tpp;
        struct xfs_buf_log_item         *bli;
        struct xfs_inode_log_item       *ili;
        struct xfs_log_item             *lip;
        struct xfs_buf                  *bplist[XFS_DEFER_OPS_NR_BUFS];
        struct xfs_inode                *iplist[XFS_DEFER_OPS_NR_INODES];
        int                             bpcount = 0, ipcount = 0;
        int                             i;
        int                             error;

        list_for_each_entry(lip, &tp->t_items, li_trans) {
                switch (lip->li_type) {
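                /* Track held buffers; they get rejoined after the roll. */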
                case XFS_LI_BUF:
                        bli = container_of(lip, struct xfs_buf_log_item,
                                           bli_item);
                        if (bli->bli_flags & XFS_BLI_HOLD) {
                                if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
                                        ASSERT(0);
                                        return -EFSCORRUPTED;
                                }
                                xfs_trans_dirty_buf(tp, bli->bli_buf);
                                bplist[bpcount++] = bli->bli_buf;
                        }
                        break;
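                /*
                 * Inodes joined with lock_flags == 0 stay locked across the
                 * commit, so track them for rejoining to the new transaction.
                 */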
                case XFS_LI_INODE:
                        ili = container_of(lip, struct xfs_inode_log_item,
                                           ili_item);
                        if (ili->ili_lock_flags == 0) {
                                if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
                                        ASSERT(0);
                                        return -EFSCORRUPTED;
                                }
                                xfs_trans_log_inode(tp, ili->ili_inode,
                                                    XFS_ILOG_CORE);
                                iplist[ipcount++] = ili->ili_inode;
                        }
                        break;
                default:
                        break;
                }
        }

        trace_xfs_defer_trans_roll(tp, _RET_IP_);

        /*
         * Roll the transaction.  Rolling always gives back a new transaction
         * (even if committing the old one fails!) to hand to the caller, so
         * we join the held resources to the new transaction so that we
         * always return with the held resources joined to @tpp, no matter
         * what happened.
         */
        error = xfs_trans_roll(tpp);
        tp = *tpp;

        /* Rejoin the joined inodes. */
        for (i = 0; i < ipcount; i++)
                xfs_trans_ijoin(tp, iplist[i], 0);

        /* Rejoin the buffers and dirty them so the log moves forward. */
        for (i = 0; i < bpcount; i++) {
                xfs_trans_bjoin(tp, bplist[i]);
                xfs_trans_bhold(tp, bplist[i]);
        }

        if (error)
                trace_xfs_defer_trans_roll_error(tp, error);
        return error;
}

/*
 * Reset an already used dfops after finish.
 */
static void
xfs_defer_reset(
        struct xfs_trans        *tp)
{
        ASSERT(list_empty(&tp->t_dfops));

        /*
         * Low mode state transfers across transaction rolls to mirror dfops
         * lifetime. Clear it now that dfops is reset.
         */
        tp->t_flags &= ~XFS_TRANS_LOWMODE;
}

/*
 * Free up any items left in the list.
 */
static void
xfs_defer_cancel_list(
        struct xfs_mount                *mp,
        struct list_head                *dop_list)
{
        struct xfs_defer_pending        *dfp;
        struct xfs_defer_pending        *pli;
        struct list_head                *pwi;
        struct list_head                *n;
        const struct xfs_defer_op_type  *ops;

        /*
         * Free the pending items.  Caller should already have arranged
         * for the intent items to be released.
         */
        list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
                ops = defer_op_types[dfp->dfp_type];
                trace_xfs_defer_cancel_list(mp, dfp);
                list_del(&dfp->dfp_list);
                list_for_each_safe(pwi, n, &dfp->dfp_work) {
                        list_del(pwi);
                        dfp->dfp_count--;
                        ops->cancel_item(pwi);
                }
                ASSERT(dfp->dfp_count == 0);
                kmem_free(dfp);
        }
}

/*
 * Finish all the pending work.  This involves logging intent items for
 * any work items that wandered in since the last transaction roll (if
 * one has even happened), rolling the transaction, and finishing the
 * work items in the first item on the logged-and-pending list.
 */
int
xfs_defer_finish_noroll(
        struct xfs_trans                **tp)
{
        struct xfs_defer_pending        *dfp;
        struct list_head                *li;
        struct list_head                *n;
        void                            *state;
        int                             error = 0;
        const struct xfs_defer_op_type  *ops;
        LIST_HEAD(dop_pending);

        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);

        trace_xfs_defer_finish(*tp, _RET_IP_);

        /* Until we run out of pending work to finish... */
        while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
                /* log intents and pull in intake items */
                xfs_defer_create_intents(*tp);
                list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);

                /*
                 * Roll the transaction.
                 */
                error = xfs_defer_trans_roll(tp);
                if (error)
                        goto out;

                /* Log an intent-done item for the first pending item. */
                dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
                                       dfp_list);
                ops = defer_op_types[dfp->dfp_type];
                trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
                dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
                                dfp->dfp_count);

                /* Finish the work items. */
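                /* @state lets ->finish_item carry private data across calls. */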
                state = NULL;
                list_for_each_safe(li, n, &dfp->dfp_work) {
                        list_del(li);
                        dfp->dfp_count--;
                        error = ops->finish_item(*tp, li, dfp->dfp_done,
                                        &state);
                        if (error == -EAGAIN) {
                                /*
                                 * Caller wants a fresh transaction;
                                 * put the work item back on the list
                                 * and jump out.
                                 */
                                list_add(li, &dfp->dfp_work);
                                dfp->dfp_count++;
                                break;
                        } else if (error) {
                                /*
                                 * Clean up after ourselves and jump out.
                                 * xfs_defer_cancel will take care of freeing
                                 * all these lists and stuff.
                                 */
                                if (ops->finish_cleanup)
                                        ops->finish_cleanup(*tp, state, error);
                                goto out;
                        }
                }
                if (error == -EAGAIN) {
                        /*
                         * Caller wants a fresh transaction, so log a
                         * new log intent item to replace the old one
                         * and roll the transaction.  See "Requesting
                         * a Fresh Transaction while Finishing
                         * Deferred Work" above.
                         */
                        dfp->dfp_intent = ops->create_intent(*tp,
                                        dfp->dfp_count);
                        dfp->dfp_done = NULL;
                        list_for_each(li, &dfp->dfp_work)
                                ops->log_item(*tp, dfp->dfp_intent, li);
                } else {
                        /* Done with the dfp, free it. */
                        list_del(&dfp->dfp_list);
                        kmem_free(dfp);
                }

                if (ops->finish_cleanup)
                        ops->finish_cleanup(*tp, state, error);
        }

out:
        if (error) {
                xfs_defer_trans_abort(*tp, &dop_pending);
                xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
                trace_xfs_defer_finish_error(*tp, error);
                xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
                xfs_defer_cancel(*tp);
                return error;
        }

        trace_xfs_defer_finish_done(*tp, _RET_IP_);
        return 0;
}

int
xfs_defer_finish(
        struct xfs_trans        **tp)
{
        int                     error;

        /*
         * Finish and roll the transaction once more to avoid returning to the
         * caller with a dirty transaction.
         */
        error = xfs_defer_finish_noroll(tp);
        if (error)
                return error;
        if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
                error = xfs_defer_trans_roll(tp);
                if (error) {
                        xfs_force_shutdown((*tp)->t_mountp,
                                           SHUTDOWN_CORRUPT_INCORE);
                        return error;
                }
        }
        xfs_defer_reset(*tp);
        return 0;
}

void
xfs_defer_cancel(
        struct xfs_trans        *tp)
{
        struct xfs_mount        *mp = tp->t_mountp;

        trace_xfs_defer_cancel(tp, _RET_IP_);
        xfs_defer_cancel_list(mp, &tp->t_dfops);
}

/* Add an item for later deferred processing. */
void
xfs_defer_add(
        struct xfs_trans                *tp,
        enum xfs_defer_ops_type         type,
        struct list_head                *li)
{
        struct xfs_defer_pending        *dfp = NULL;
        const struct xfs_defer_op_type  *ops;

        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
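        /* Every defer type must have a matching entry in defer_op_types. */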
        BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);

        /*
         * Add the item to a pending item at the end of the intake list.
         * If the last pending item has the same type, reuse it.  Else,
         * create a new pending item at the end of the intake list.
         */
        if (!list_empty(&tp->t_dfops)) {
                dfp = list_last_entry(&tp->t_dfops,
                                struct xfs_defer_pending, dfp_list);
                ops = defer_op_types[dfp->dfp_type];
                if (dfp->dfp_type != type ||
                    (ops->max_items && dfp->dfp_count >= ops->max_items))
                        dfp = NULL;
        }
        if (!dfp) {
                dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
                                KM_SLEEP | KM_NOFS);
                dfp->dfp_type = type;
                dfp->dfp_intent = NULL;
                dfp->dfp_done = NULL;
                dfp->dfp_count = 0;
                INIT_LIST_HEAD(&dfp->dfp_work);
                list_add_tail(&dfp->dfp_list, &tp->t_dfops);
        }

        list_add_tail(li, &dfp->dfp_work);
        dfp->dfp_count++;
}

/*
 * Move deferred ops from one transaction to another and reset the source to
 * initial state. This is primarily used to carry state forward across
 * transaction rolls with pending dfops.
 */
void
xfs_defer_move(
        struct xfs_trans        *dtp,
        struct xfs_trans        *stp)
{
        list_splice_init(&stp->t_dfops, &dtp->t_dfops);

        /*
         * Low free space mode was historically controlled by a dfops field.
         * This meant that low mode state potentially carried across multiple
         * transaction rolls. Transfer low mode on a dfops move to preserve
         * that behavior.
         */
        dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);

        xfs_defer_reset(stp);
}