linux/fs/xfs/libxfs/xfs_defer.c
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trace.h"

/*
 * Deferred Operations in XFS
 *
 * Due to the way locking rules work in XFS, certain transactions (block
 * mapping and unmapping, typically) have permanent reservations so that
 * we can roll the transaction to adhere to AG locking order rules and
 * to unlock buffers between metadata updates.  Prior to rmap/reflink,
 * the mapping code had a mechanism to perform these deferrals for
 * extents that were going to be freed; this code makes that facility
 * more generic.
 *
 * When adding the reverse mapping and reflink features, it became
 * necessary to perform complex remapping operations across multiple
 * transactions to comply with AG locking order rules, and to be able
 * to spread a single refcount update operation (an operation on an
 * n-block extent can update as many as n records!) among multiple
 * transactions.  XFS can roll a transaction to facilitate this, but
 * using this facility requires us to log "intent" items in case log
 * recovery needs to redo the operation, and to log "done" items to
 * indicate that redo is not necessary.
 *
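 * Each type of deferred work supplies a small vector of callbacks.
 * The sketch below is reconstructed from the hooks this file actually
 * invokes; the authoritative definition lives in xfs_defer.h, so the
 * field order and exact types shown here are illustrative only:
 *
 *	struct xfs_defer_op_type {
 *		enum xfs_defer_ops_type	type;
 *		unsigned int		max_items;
 *		void *(*create_intent)(struct xfs_trans *tp, uint count);
 *		void (*abort_intent)(void *intent);
 *		void (*log_item)(struct xfs_trans *tp, void *intent,
 *				 struct list_head *item);
 *		int (*diff_items)(void *priv, struct list_head *a,
 *				  struct list_head *b);
 *		void *(*create_done)(struct xfs_trans *tp, void *intent,
 *				     unsigned int count);
 *		int (*finish_item)(struct xfs_trans *tp,
 *				   struct xfs_defer_ops *dop,
 *				   struct list_head *item, void *done,
 *				   void **state);
 *		void (*finish_cleanup)(struct xfs_trans *tp, void *state,
 *				       int error);
 *		void (*cancel_item)(struct list_head *item);
 *	};
 *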
 * Deferred work is tracked in xfs_defer_pending items.  Each pending
 * item tracks one type of deferred work.  Incoming work items (which
 * have not yet had an intent logged) are attached to a pending item
 * on the dop_intake list, where they wait for the caller to finish
 * the deferred operations.
 *
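 * Reconstructed from the fields this file touches (again, see
 * xfs_defer.h for the real definition), a pending item looks roughly
 * like this:
 *
 *	struct xfs_defer_pending {
 *		const struct xfs_defer_op_type	*dfp_type; (callbacks)
 *		struct list_head	dfp_list;    (intake/pending link)
 *		void			*dfp_intent; (log intent item)
 *		void			*dfp_done;   (log done item)
 *		struct list_head	dfp_work;    (queue of work items)
 *		unsigned int		dfp_count;   (# of work items)
 *	};
 *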
 * Finishing a set of deferred operations is an involved process.  To
 * start, we define "rolling a deferred-op transaction" as follows:
 *
 * > For each xfs_defer_pending item on the dop_intake list,
 *   - Sort the work items in AG order.  XFS locking
 *     order rules require us to lock buffers in AG order.
 *   - Create a log intent item for that type.
 *   - Attach it to the pending item.
 *   - Move the pending item from the dop_intake list to the
 *     dop_pending list.
 * > Roll the transaction.
 *
 * NOTE: To avoid exceeding the transaction reservation, we limit the
 * number of items that we attach to a given xfs_defer_pending.
 *
 * The actual finishing process looks like this:
 *
 * > For each xfs_defer_pending in the dop_pending list,
 *   - Roll the deferred-op transaction as above.
 *   - Create a log done item for that type, and attach it to the
 *     log intent item.
 *   - For each work item attached to the log intent item,
 *     * Perform the described action.
 *     * Attach the work item to the log done item.
 *     * If the result of doing the work was -EAGAIN, ->finish_item
 *       wants a new transaction.  See the "Requesting a Fresh
 *       Transaction while Finishing Deferred Work" section below for
 *       details.
 *
 * The key here is that we must log an intent item for all pending
 * work items every time we roll the transaction, and that we must log
 * a done item as soon as the work is completed.  With this mechanism
 * we can perform complex remapping operations, chaining intent items
 * as needed.
 *
 * Requesting a Fresh Transaction while Finishing Deferred Work
 *
 * If ->finish_item decides that it needs a fresh transaction to
 * finish the work, it must ask its caller (xfs_defer_finish) for a
 * continuation.  The most likely cause of this circumstance is that
 * the refcount adjust functions have decided that they've logged
 * enough items to be at risk of exceeding the transaction
 * reservation.
 *
 * To get a fresh transaction, we want to log the existing log done
 * item to prevent the log intent item from replaying, immediately log
 * a new log intent item with the unfinished work items, roll the
 * transaction, and re-call ->finish_item where it left off.  The
 * log done item and the new log intent item must be in the same
 * transaction or atomicity cannot be guaranteed; defer_finish ensures
 * that this happens.
 *
 * This requires some coordination between ->finish_item and
 * defer_finish.  Upon deciding to request a new transaction,
 * ->finish_item should update the current work item to reflect the
 * unfinished work.  Next, it should reset the log done item's list
 * count to the number of items finished, and return -EAGAIN.
 * defer_finish sees the -EAGAIN, logs the new log intent item
 * with the remaining work items, and leaves the xfs_defer_pending
 * item at the head of the dop_pending list.  Then it rolls the
 * transaction and picks up processing where it left off.
 * ->finish_item must be careful to leave enough transaction
 * reservation to fit the new log intent item.
 *
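 * A hypothetical ->finish_item implementation that requests a
 * continuation might look like the sketch below.  All of the
 * "example" names are invented for illustration; only the calling
 * convention (the arguments and the -EAGAIN return) comes from the
 * code in this file:
 *
 *	STATIC int
 *	xfs_example_finish_item(
 *		struct xfs_trans	*tp,
 *		struct xfs_defer_ops	*dop,
 *		struct list_head	*item,
 *		void			*done_item,
 *		void			**state)
 *	{
 *		struct xfs_example_work	*work;
 *
 *		work = container_of(item, struct xfs_example_work,
 *				xw_list);
 *		return xfs_example_do_partial_work(tp, work, done_item);
 *	}
 *
 * Here xfs_example_do_partial_work would update *work to describe
 * only the unfinished remainder, reset the done item's count to what
 * was actually completed, and return -EAGAIN, which xfs_defer_finish
 * translates into "relog an intent and roll", as described above.
 *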
 * This is an example of remapping the extent (E, E+B) into file X at
 * offset A and dealing with the extent (C, C+B) already being mapped
 * there:
 * +-------------------------------------------------+
 * | Unmap file X startblock C offset A length B     | t0
 * | Intent to reduce refcount for extent (C, B)     |
 * | Intent to remove rmap (X, C, A, B)              |
 * | Intent to free extent (D, 1) (bmbt block)       |
 * | Intent to map (X, A, B) at startblock E         |
 * +-------------------------------------------------+
 * | Map file X startblock E offset A length B       | t1
 * | Done mapping (X, E, A, B)                       |
 * | Intent to increase refcount for extent (E, B)   |
 * | Intent to add rmap (X, E, A, B)                 |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C, B)               | t2
 * | Done reducing refcount for extent (C, 9)        |
 * | Intent to reduce refcount for extent (C+9, B-9) |
 * | (ran out of space after 9 refcount updates)     |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C+9, B-9)           | t3
 * | Done reducing refcount for extent (C+9, B-9)    |
 * | Increase refcount for extent (E, B)             |
 * | Done increasing refcount for extent (E, B)      |
 * | Intent to free extent (C, B)                    |
 * | Intent to free extent (F, 1) (refcountbt block) |
 * | Intent to remove rmap (F, 1, REFC)              |
 * +-------------------------------------------------+
 * | Remove rmap (X, C, A, B)                        | t4
 * | Done removing rmap (X, C, A, B)                 |
 * | Add rmap (X, E, A, B)                           |
 * | Done adding rmap (X, E, A, B)                   |
 * | Remove rmap (F, 1, REFC)                        |
 * | Done removing rmap (F, 1, REFC)                 |
 * +-------------------------------------------------+
 * | Free extent (C, B)                              | t5
 * | Done freeing extent (C, B)                      |
 * | Free extent (D, 1)                              |
 * | Done freeing extent (D, 1)                      |
 * | Free extent (F, 1)                              |
 * | Done freeing extent (F, 1)                      |
 * +-------------------------------------------------+
 *
 * If we should crash before t2 commits, log recovery replays
 * the following intent items:
 *
 * - Intent to reduce refcount for extent (C, B)
 * - Intent to remove rmap (X, C, A, B)
 * - Intent to free extent (D, 1) (bmbt block)
 * - Intent to increase refcount for extent (E, B)
 * - Intent to add rmap (X, E, A, B)
 *
 * In the process of recovering, it should also generate and take care
 * of these intent items:
 *
 * - Intent to free extent (C, B)
 * - Intent to free extent (F, 1) (refcountbt block)
 * - Intent to remove rmap (F, 1, REFC)
 *
 * Note that the continuation requested between t2 and t3 is likely to
 * recur.
 */

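/*
 * A minimal caller-side sketch, assuming a hypothetical update helper
 * (xfs_example_deferred_update) and a transaction tp that already
 * holds a permanent log reservation with inode ip joined.  Real
 * callers, such as the block mapping code, are more elaborate about
 * error handling:
 *
 *	struct xfs_defer_ops	dfops;
 *	xfs_fsblock_t		firstfsb;
 *	int			error;
 *
 *	xfs_defer_init(&dfops, &firstfsb);
 *	error = xfs_example_deferred_update(tp, ip, &dfops, &firstfsb);
 *	if (!error)
 *		error = xfs_defer_finish(&tp, &dfops, ip);
 *	if (error)
 *		xfs_defer_cancel(&dfops);
 */
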
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];

/*
 * For each pending item in the intake list, log its intent item and the
 * associated extents, then add the entire intake list to the end of
 * the pending list.
 */
STATIC void
xfs_defer_intake_work(
	struct xfs_trans		*tp,
	struct xfs_defer_ops		*dop)
{
	struct list_head		*li;
	struct xfs_defer_pending	*dfp;

	list_for_each_entry(dfp, &dop->dop_intake, dfp_list) {
		dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
				dfp->dfp_count);
		trace_xfs_defer_intake_work(tp->t_mountp, dfp);
		list_sort(tp->t_mountp, &dfp->dfp_work,
				dfp->dfp_type->diff_items);
		list_for_each(li, &dfp->dfp_work)
			dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
	}

	list_splice_tail_init(&dop->dop_intake, &dop->dop_pending);
}

/* Abort all the intents that were committed. */
STATIC void
xfs_defer_trans_abort(
	struct xfs_trans		*tp,
	struct xfs_defer_ops		*dop,
	int				error)
{
	struct xfs_defer_pending	*dfp;

	trace_xfs_defer_trans_abort(tp->t_mountp, dop);

	/* Abort intent items that don't have a done item. */
	list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
		trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
		if (dfp->dfp_intent && !dfp->dfp_done) {
			dfp->dfp_type->abort_intent(dfp->dfp_intent);
			dfp->dfp_intent = NULL;
		}
	}

	/* Shut down FS. */
	xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ?
			SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR);
}

/* Roll a transaction so we can do some deferred op processing. */
STATIC int
xfs_defer_trans_roll(
	struct xfs_trans		**tp,
	struct xfs_defer_ops		*dop,
	struct xfs_inode		*ip)
{
	int				i;
	int				error;

	/* Log all the joined inodes except the one we passed in. */
	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
		if (dop->dop_inodes[i] == ip)
			continue;
		xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
	}

	trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);

	/* Roll the transaction. */
	error = xfs_trans_roll(tp, ip);
	if (error) {
		trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
		xfs_defer_trans_abort(*tp, dop, error);
		return error;
	}
	dop->dop_committed = true;

	/* Rejoin the joined inodes except the one we passed in. */
	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
		if (dop->dop_inodes[i] == ip)
			continue;
		xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
	}

	return error;
}

/* Do we have any work items to finish? */
bool
xfs_defer_has_unfinished_work(
	struct xfs_defer_ops		*dop)
{
	return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake);
}

/*
 * Add this inode to the deferred op.  Each joined inode is relogged
 * each time we roll the transaction, in addition to any inode passed
 * to xfs_defer_finish().
 */
int
xfs_defer_join(
	struct xfs_defer_ops		*dop,
	struct xfs_inode		*ip)
{
	int				i;

	for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) {
		if (dop->dop_inodes[i] == ip)
			return 0;
		else if (dop->dop_inodes[i] == NULL) {
			dop->dop_inodes[i] = ip;
			return 0;
		}
	}

	return -EFSCORRUPTED;
}

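/*
 * For example, a hypothetical two-inode caller would join its second
 * inode so that it is relogged and rejoined across every roll:
 *
 *	error = xfs_defer_join(&dfops, ip2);
 */
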
/*
 * Finish all the pending work.  This involves logging intent items for
 * any work items that wandered in since the last transaction roll (if
 * one has even happened), rolling the transaction, and finishing the
 * work items in the first item on the logged-and-pending list.
 *
 * If an inode is provided, relog it to the new transaction.
 */
int
xfs_defer_finish(
	struct xfs_trans		**tp,
	struct xfs_defer_ops		*dop,
	struct xfs_inode		*ip)
{
	struct xfs_defer_pending	*dfp;
	struct list_head		*li;
	struct list_head		*n;
	void				*state;
	int				error = 0;
	void				(*cleanup_fn)(struct xfs_trans *, void *, int);

	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);

	trace_xfs_defer_finish((*tp)->t_mountp, dop);

	/* Until we run out of pending work to finish... */
	while (xfs_defer_has_unfinished_work(dop)) {
		/* Log intents for work items sitting in the intake. */
		xfs_defer_intake_work(*tp, dop);

		/* Roll the transaction. */
		error = xfs_defer_trans_roll(tp, dop, ip);
		if (error)
			goto out;

		/* Log an intent-done item for the first pending item. */
		dfp = list_first_entry(&dop->dop_pending,
				struct xfs_defer_pending, dfp_list);
		trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
		dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
				dfp->dfp_count);
		cleanup_fn = dfp->dfp_type->finish_cleanup;

		/* Finish the work items. */
		state = NULL;
		list_for_each_safe(li, n, &dfp->dfp_work) {
			list_del(li);
			dfp->dfp_count--;
			error = dfp->dfp_type->finish_item(*tp, dop, li,
					dfp->dfp_done, &state);
			if (error == -EAGAIN) {
				/*
				 * Caller wants a fresh transaction;
				 * put the work item back on the list
				 * and jump out.
				 */
				list_add(li, &dfp->dfp_work);
				dfp->dfp_count++;
				break;
			} else if (error) {
				/*
				 * Clean up after ourselves and jump out.
				 * xfs_defer_cancel will take care of freeing
				 * all these lists and stuff.
				 */
				if (cleanup_fn)
					cleanup_fn(*tp, state, error);
				xfs_defer_trans_abort(*tp, dop, error);
				goto out;
			}
		}
		if (error == -EAGAIN) {
			/*
			 * Caller wants a fresh transaction, so log a
			 * new log intent item to replace the old one
			 * and roll the transaction.  See "Requesting
			 * a Fresh Transaction while Finishing
			 * Deferred Work" above.
			 */
			dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
					dfp->dfp_count);
			dfp->dfp_done = NULL;
			list_for_each(li, &dfp->dfp_work)
				dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
						li);
		} else {
			/* Done with the dfp, free it. */
			list_del(&dfp->dfp_list);
			kmem_free(dfp);
		}

		if (cleanup_fn)
			cleanup_fn(*tp, state, error);
	}

out:
	if (error)
		trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
	else
		trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
	return error;
}

/*
 * Free up any items left in the list.
 */
void
xfs_defer_cancel(
	struct xfs_defer_ops		*dop)
{
	struct xfs_defer_pending	*dfp;
	struct xfs_defer_pending	*pli;
	struct list_head		*pwi;
	struct list_head		*n;

	trace_xfs_defer_cancel(NULL, dop);

	/*
	 * Free the pending items.  Caller should already have arranged
	 * for the intent items to be released.
	 */
	list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) {
		trace_xfs_defer_intake_cancel(NULL, dfp);
		list_del(&dfp->dfp_list);
		list_for_each_safe(pwi, n, &dfp->dfp_work) {
			list_del(pwi);
			dfp->dfp_count--;
			dfp->dfp_type->cancel_item(pwi);
		}
		ASSERT(dfp->dfp_count == 0);
		kmem_free(dfp);
	}
	list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) {
		trace_xfs_defer_pending_cancel(NULL, dfp);
		list_del(&dfp->dfp_list);
		list_for_each_safe(pwi, n, &dfp->dfp_work) {
			list_del(pwi);
			dfp->dfp_count--;
			dfp->dfp_type->cancel_item(pwi);
		}
		ASSERT(dfp->dfp_count == 0);
		kmem_free(dfp);
	}
}

/* Add an item for later deferred processing. */
void
xfs_defer_add(
	struct xfs_defer_ops		*dop,
	enum xfs_defer_ops_type		type,
	struct list_head		*li)
{
	struct xfs_defer_pending	*dfp = NULL;

	/*
	 * Add the item to a pending item at the end of the intake list.
	 * If the last pending item has the same type, reuse it.  Else,
	 * create a new pending item at the end of the intake list.
	 */
	if (!list_empty(&dop->dop_intake)) {
		dfp = list_last_entry(&dop->dop_intake,
				struct xfs_defer_pending, dfp_list);
		if (dfp->dfp_type->type != type ||
		    (dfp->dfp_type->max_items &&
		     dfp->dfp_count >= dfp->dfp_type->max_items))
			dfp = NULL;
	}
	if (!dfp) {
		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
				KM_SLEEP | KM_NOFS);
		dfp->dfp_type = defer_op_types[type];
		dfp->dfp_intent = NULL;
		dfp->dfp_done = NULL;
		dfp->dfp_count = 0;
		INIT_LIST_HEAD(&dfp->dfp_work);
		list_add_tail(&dfp->dfp_list, &dop->dop_intake);
	}

	list_add_tail(li, &dfp->dfp_work);
	dfp->dfp_count++;
}

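/*
 * Providers embed a struct list_head in their work item and hand that
 * to xfs_defer_add.  A hypothetical provider (all "example" names,
 * including the type enumerator, are invented) might queue work like
 * this:
 *
 *	struct xfs_example_work {
 *		struct list_head	xw_list;
 *		xfs_fsblock_t		xw_startblock;
 *		xfs_extlen_t		xw_blockcount;
 *	};
 *
 *	work = kmem_alloc(sizeof(*work), KM_SLEEP | KM_NOFS);
 *	work->xw_startblock = bno;
 *	work->xw_blockcount = len;
 *	xfs_defer_add(dop, XFS_DEFER_OPS_TYPE_EXAMPLE, &work->xw_list);
 */
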
/* Register a deferred operation type. */
void
xfs_defer_init_op_type(
	const struct xfs_defer_op_type	*type)
{
	defer_op_types[type->type] = type;
}

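/*
 * Each provider registers its op type vector once at initialization
 * time, e.g. (the symbol name here is illustrative):
 *
 *	xfs_defer_init_op_type(&xfs_extent_free_defer_type);
 */
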
/* Initialize a deferred operation. */
void
xfs_defer_init(
	struct xfs_defer_ops		*dop,
	xfs_fsblock_t			*fbp)
{
	dop->dop_committed = false;
	dop->dop_low = false;
	memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
	*fbp = NULLFSBLOCK;
	INIT_LIST_HEAD(&dop->dop_intake);
	INIT_LIST_HEAD(&dop->dop_pending);
	trace_xfs_defer_init(NULL, dop);
}