linux/fs/ext4/fast_commit.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30 * - EXT4_FC_TAG_LINK           - records directory entry link
  31 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41 *                                during recovery. Note that iblocks field is
  42 *                                not replayed and instead derived during
  43 *                                replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important
 *   to make one more fast commit fall back to full commit after the stop
 *   call so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() is followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher
 *    level routines. This can be made more fine grained by combining with
 *    ext4_journal_start().
 174 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185        BUFFER_TRACE(bh, "");
 186        if (uptodate) {
 187                ext4_debug("%s: Block %lld up-to-date",
 188                           __func__, bh->b_blocknr);
 189                set_buffer_uptodate(bh);
 190        } else {
 191                ext4_debug("%s: Block %lld not up-to-date",
 192                           __func__, bh->b_blocknr);
 193                clear_buffer_uptodate(bh);
 194        }
 195
 196        unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201        struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203        ei->i_fc_lblk_start = 0;
 204        ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209        struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211        ext4_fc_reset_inode(inode);
 212        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213        INIT_LIST_HEAD(&ei->i_fc_list);
 214        init_waitqueue_head(&ei->i_fc_wait);
 215        atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222        wait_queue_head_t *wq;
 223        struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227                        EXT4_STATE_FC_COMMITTING);
 228        wq = bit_waitqueue(&ei->i_state_flags,
 229                                EXT4_STATE_FC_COMMITTING);
 230#else
 231        DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232                        EXT4_STATE_FC_COMMITTING);
 233        wq = bit_waitqueue(&ei->i_flags,
 234                                EXT4_STATE_FC_COMMITTING);
 235#endif
 236        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239        schedule();
 240        finish_wait(wq, &wait.wq_entry);
 241}
 242
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252        struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256                return;
 257
 258restart:
 259        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260        if (list_empty(&ei->i_fc_list))
 261                goto out;
 262
 263        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264                ext4_fc_wait_committing_inode(inode);
 265                goto restart;
 266        }
 267out:
 268        atomic_inc(&ei->i_fc_updates);
 269        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277        struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281                return;
 282
 283        if (atomic_dec_and_test(&ei->i_fc_updates))
 284                wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293        struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297                return;
 298
 299restart:
 300        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301        if (list_empty(&ei->i_fc_list)) {
 302                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303                return;
 304        }
 305
 306        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307                ext4_fc_wait_committing_inode(inode);
 308                goto restart;
 309        }
 310        list_del_init(&ei->i_fc_list);
 311        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320        struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324                return;
 325
 326        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327        WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337        struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341                return;
 342
 343        WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345        atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357                return;
 358
 359        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360        atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365        return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366                atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380        handle_t *handle, struct inode *inode,
 381        int (*__fc_track_fn)(struct inode *, void *, bool),
 382        void *args, int enqueue)
 383{
 384        bool update = false;
 385        struct ext4_inode_info *ei = EXT4_I(inode);
 386        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387        tid_t tid = 0;
 388        int ret;
 389
 390        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391            (sbi->s_mount_state & EXT4_FC_REPLAY))
 392                return -EOPNOTSUPP;
 393
 394        if (ext4_fc_is_ineligible(inode->i_sb))
 395                return -EINVAL;
 396
 397        tid = handle->h_transaction->t_tid;
 398        mutex_lock(&ei->i_fc_lock);
 399        if (tid == ei->i_sync_tid) {
 400                update = true;
 401        } else {
 402                ext4_fc_reset_inode(inode);
 403                ei->i_sync_tid = tid;
 404        }
 405        ret = __fc_track_fn(inode, args, update);
 406        mutex_unlock(&ei->i_fc_lock);
 407
 408        if (!enqueue)
 409                return ret;
 410
 411        spin_lock(&sbi->s_fc_lock);
 412        if (list_empty(&EXT4_I(inode)->i_fc_list))
 413                list_add_tail(&EXT4_I(inode)->i_fc_list,
 414                                (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415                                &sbi->s_fc_q[FC_Q_STAGING] :
 416                                &sbi->s_fc_q[FC_Q_MAIN]);
 417        spin_unlock(&sbi->s_fc_lock);
 418
 419        return ret;
 420}
 421
/* Arguments handed to __track_dentry_update() through its void * parameter. */
struct __track_dentry_update_args {
        /* Directory entry whose name and parent get recorded. */
        struct dentry *dentry;
        /* Operation stored in the queued update (an EXT4_FC_TAG_* value). */
        int op;
};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430        struct ext4_fc_dentry_update *node;
 431        struct ext4_inode_info *ei = EXT4_I(inode);
 432        struct __track_dentry_update_args *dentry_update =
 433                (struct __track_dentry_update_args *)arg;
 434        struct dentry *dentry = dentry_update->dentry;
 435        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437        mutex_unlock(&ei->i_fc_lock);
 438        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439        if (!node) {
 440                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441                mutex_lock(&ei->i_fc_lock);
 442                return -ENOMEM;
 443        }
 444
 445        node->fcd_op = dentry_update->op;
 446        node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447        node->fcd_ino = inode->i_ino;
 448        if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450                if (!node->fcd_name.name) {
 451                        kmem_cache_free(ext4_fc_dentry_cachep, node);
 452                        ext4_fc_mark_ineligible(inode->i_sb,
 453                                EXT4_FC_REASON_NOMEM);
 454                        mutex_lock(&ei->i_fc_lock);
 455                        return -ENOMEM;
 456                }
 457                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458                        dentry->d_name.len);
 459        } else {
 460                memcpy(node->fcd_iname, dentry->d_name.name,
 461                        dentry->d_name.len);
 462                node->fcd_name.name = node->fcd_iname;
 463        }
 464        node->fcd_name.len = dentry->d_name.len;
 465
 466        spin_lock(&sbi->s_fc_lock);
 467        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468                list_add_tail(&node->fcd_list,
 469                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470        else
 471                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472        spin_unlock(&sbi->s_fc_lock);
 473        mutex_lock(&ei->i_fc_lock);
 474
 475        return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479                struct inode *inode, struct dentry *dentry)
 480{
 481        struct __track_dentry_update_args args;
 482        int ret;
 483
 484        args.dentry = dentry;
 485        args.op = EXT4_FC_TAG_UNLINK;
 486
 487        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488                                        (void *)&args, 0);
 489        trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494        __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498        struct inode *inode, struct dentry *dentry)
 499{
 500        struct __track_dentry_update_args args;
 501        int ret;
 502
 503        args.dentry = dentry;
 504        args.op = EXT4_FC_TAG_LINK;
 505
 506        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507                                        (void *)&args, 0);
 508        trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 517{
 518        struct __track_dentry_update_args args;
 519        struct inode *inode = d_inode(dentry);
 520        int ret;
 521
 522        args.dentry = dentry;
 523        args.op = EXT4_FC_TAG_CREAT;
 524
 525        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526                                        (void *)&args, 0);
 527        trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530/* __track_fn for inode tracking */
 531static int __track_inode(struct inode *inode, void *arg, bool update)
 532{
 533        if (update)
 534                return -EEXIST;
 535
 536        EXT4_I(inode)->i_fc_lblk_len = 0;
 537
 538        return 0;
 539}
 540
 541void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 542{
 543        int ret;
 544
 545        if (S_ISDIR(inode->i_mode))
 546                return;
 547
 548        if (ext4_should_journal_data(inode)) {
 549                ext4_fc_mark_ineligible(inode->i_sb,
 550                                        EXT4_FC_REASON_INODE_JOURNAL_DATA);
 551                return;
 552        }
 553
 554        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 555        trace_ext4_fc_track_inode(inode, ret);
 556}
 557
/* Arguments handed to __track_range() through its void * parameter. */
struct __track_range_args {
        /* First and last logical block of the range; both are inclusive. */
        ext4_lblk_t start, end;
};
 561
 562/* __track_fn for tracking data updates */
 563static int __track_range(struct inode *inode, void *arg, bool update)
 564{
 565        struct ext4_inode_info *ei = EXT4_I(inode);
 566        ext4_lblk_t oldstart;
 567        struct __track_range_args *__arg =
 568                (struct __track_range_args *)arg;
 569
 570        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 571                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 572                return -ECANCELED;
 573        }
 574
 575        oldstart = ei->i_fc_lblk_start;
 576
 577        if (update && ei->i_fc_lblk_len > 0) {
 578                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 579                ei->i_fc_lblk_len =
 580                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 581                                ei->i_fc_lblk_start + 1;
 582        } else {
 583                ei->i_fc_lblk_start = __arg->start;
 584                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 585        }
 586
 587        return 0;
 588}
 589
 590void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 591                         ext4_lblk_t end)
 592{
 593        struct __track_range_args args;
 594        int ret;
 595
 596        if (S_ISDIR(inode->i_mode))
 597                return;
 598
 599        args.start = start;
 600        args.end = end;
 601
 602        ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 603
 604        trace_ext4_fc_track_range(inode, start, end, ret);
 605}
 606
 607static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 608{
 609        int write_flags = REQ_SYNC;
 610        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 611
 612        /* Add REQ_FUA | REQ_PREFLUSH only its tail */
 613        if (test_opt(sb, BARRIER) && is_tail)
 614                write_flags |= REQ_FUA | REQ_PREFLUSH;
 615        lock_buffer(bh);
 616        set_buffer_dirty(bh);
 617        set_buffer_uptodate(bh);
 618        bh->b_end_io = ext4_end_buffer_io_sync;
 619        submit_bh(REQ_OP_WRITE, write_flags, bh);
 620        EXT4_SB(sb)->s_fc_bh = NULL;
 621}
 622
 623/* Ext4 commit path routines */
 624
 625/* memzero and update CRC */
 626static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 627                                u32 *crc)
 628{
 629        void *ret;
 630
 631        ret = memset(dst, 0, len);
 632        if (crc)
 633                *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 634        return ret;
 635}
 636
 637/*
 638 * Allocate len bytes on a fast commit buffer.
 639 *
 640 * During the commit time this function is used to manage fast commit
 641 * block space. We don't split a fast commit log onto different
 642 * blocks. So this function makes sure that if there's not enough space
 643 * on the current block, the remaining space in the current block is
 644 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 645 * new block is from jbd2 and CRC is updated to reflect the padding
 646 * we added.
 647 */
 648static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 649{
 650        struct ext4_fc_tl *tl;
 651        struct ext4_sb_info *sbi = EXT4_SB(sb);
 652        struct buffer_head *bh;
 653        int bsize = sbi->s_journal->j_blocksize;
 654        int ret, off = sbi->s_fc_bytes % bsize;
 655        int pad_len;
 656
 657        /*
 658         * After allocating len, we should have space at least for a 0 byte
 659         * padding.
 660         */
 661        if (len + sizeof(struct ext4_fc_tl) > bsize)
 662                return NULL;
 663
 664        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 665                /*
 666                 * Only allocate from current buffer if we have enough space for
 667                 * this request AND we have space to add a zero byte padding.
 668                 */
 669                if (!sbi->s_fc_bh) {
 670                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 671                        if (ret)
 672                                return NULL;
 673                        sbi->s_fc_bh = bh;
 674                }
 675                sbi->s_fc_bytes += len;
 676                return sbi->s_fc_bh->b_data + off;
 677        }
 678        /* Need to add PAD tag */
 679        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 680        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 681        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 682        tl->fc_len = cpu_to_le16(pad_len);
 683        if (crc)
 684                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 685        if (pad_len > 0)
 686                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 687        ext4_fc_submit_bh(sb, false);
 688
 689        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 690        if (ret)
 691                return NULL;
 692        sbi->s_fc_bh = bh;
 693        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 694        return sbi->s_fc_bh->b_data;
 695}
 696
 697/* memcpy to fc reserved space and update CRC */
 698static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 699                                int len, u32 *crc)
 700{
 701        if (crc)
 702                *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 703        return memcpy(dst, src, len);
 704}
 705
 706/*
 707 * Complete a fast commit by writing tail tag.
 708 *
 709 * Writing tail tag marks the end of a fast commit. In order to guarantee
 710 * atomicity, after writing tail tag, even if there's space remaining
 711 * in the block, next commit shouldn't use it. That's why tail tag
 712 * has the length as that of the remaining space on the block.
 713 */
 714static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 715{
 716        struct ext4_sb_info *sbi = EXT4_SB(sb);
 717        struct ext4_fc_tl tl;
 718        struct ext4_fc_tail tail;
 719        int off, bsize = sbi->s_journal->j_blocksize;
 720        u8 *dst;
 721
 722        /*
 723         * ext4_fc_reserve_space takes care of allocating an extra block if
 724         * there's no enough space on this block for accommodating this tail.
 725         */
 726        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 727        if (!dst)
 728                return -ENOSPC;
 729
 730        off = sbi->s_fc_bytes % bsize;
 731
 732        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 733        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 734        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 735
 736        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 737        dst += sizeof(tl);
 738        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 739        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 740        dst += sizeof(tail.fc_tid);
 741        tail.fc_crc = cpu_to_le32(crc);
 742        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 743
 744        ext4_fc_submit_bh(sb, true);
 745
 746        return 0;
 747}
 748
 749/*
 750 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 751 * Returns false if there's not enough space.
 752 */
 753static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 754                           u32 *crc)
 755{
 756        struct ext4_fc_tl tl;
 757        u8 *dst;
 758
 759        dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 760        if (!dst)
 761                return false;
 762
 763        tl.fc_tag = cpu_to_le16(tag);
 764        tl.fc_len = cpu_to_le16(len);
 765
 766        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 767        ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 768
 769        return true;
 770}
 771
 772/* Same as above, but adds dentry tlv. */
 773static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 774                                        int parent_ino, int ino, int dlen,
 775                                        const unsigned char *dname,
 776                                        u32 *crc)
 777{
 778        struct ext4_fc_dentry_info fcd;
 779        struct ext4_fc_tl tl;
 780        u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 781                                        crc);
 782
 783        if (!dst)
 784                return false;
 785
 786        fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 787        fcd.fc_ino = cpu_to_le32(ino);
 788        tl.fc_tag = cpu_to_le16(tag);
 789        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 790        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 791        dst += sizeof(tl);
 792        ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 793        dst += sizeof(fcd);
 794        ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 795        dst += dlen;
 796
 797        return true;
 798}
 799
 800/*
 801 * Writes inode in the fast commit space under TLV with tag @tag.
 802 * Returns 0 on success, error on failure.
 803 */
 804static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 805{
 806        struct ext4_inode_info *ei = EXT4_I(inode);
 807        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 808        int ret;
 809        struct ext4_iloc iloc;
 810        struct ext4_fc_inode fc_inode;
 811        struct ext4_fc_tl tl;
 812        u8 *dst;
 813
 814        ret = ext4_get_inode_loc(inode, &iloc);
 815        if (ret)
 816                return ret;
 817
 818        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 819                inode_len += ei->i_extra_isize;
 820
 821        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 822        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 823        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 824
 825        dst = ext4_fc_reserve_space(inode->i_sb,
 826                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 827        if (!dst)
 828                return -ECANCELED;
 829
 830        if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 831                return -ECANCELED;
 832        dst += sizeof(tl);
 833        if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 834                return -ECANCELED;
 835        dst += sizeof(fc_inode);
 836        if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 837                                        inode_len, crc))
 838                return -ECANCELED;
 839
 840        return 0;
 841}
 842
 843/*
 844 * Writes updated data ranges for the inode in question. Updates CRC.
 845 * Returns 0 on success, error otherwise.
 846 */
 847static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 848{
 849        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 850        struct ext4_inode_info *ei = EXT4_I(inode);
 851        struct ext4_map_blocks map;
 852        struct ext4_fc_add_range fc_ext;
 853        struct ext4_fc_del_range lrange;
 854        struct ext4_extent *ex;
 855        int ret;
 856
 857        mutex_lock(&ei->i_fc_lock);
 858        if (ei->i_fc_lblk_len == 0) {
 859                mutex_unlock(&ei->i_fc_lock);
 860                return 0;
 861        }
 862        old_blk_size = ei->i_fc_lblk_start;
 863        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 864        ei->i_fc_lblk_len = 0;
 865        mutex_unlock(&ei->i_fc_lock);
 866
 867        cur_lblk_off = old_blk_size;
 868        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 869                  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 870
 871        while (cur_lblk_off <= new_blk_size) {
 872                map.m_lblk = cur_lblk_off;
 873                map.m_len = new_blk_size - cur_lblk_off + 1;
 874                ret = ext4_map_blocks(NULL, inode, &map, 0);
 875                if (ret < 0)
 876                        return -ECANCELED;
 877
 878                if (map.m_len == 0) {
 879                        cur_lblk_off++;
 880                        continue;
 881                }
 882
 883                if (ret == 0) {
 884                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
 885                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 886                        lrange.fc_len = cpu_to_le32(map.m_len);
 887                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 888                                            sizeof(lrange), (u8 *)&lrange, crc))
 889                                return -ENOSPC;
 890                } else {
 891                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 892                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
 893                        ex->ee_block = cpu_to_le32(map.m_lblk);
 894                        ex->ee_len = cpu_to_le16(map.m_len);
 895                        ext4_ext_store_pblock(ex, map.m_pblk);
 896                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
 897                                ext4_ext_mark_unwritten(ex);
 898                        else
 899                                ext4_ext_mark_initialized(ex);
 900                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 901                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
 902                                return -ENOSPC;
 903                }
 904
 905                cur_lblk_off += map.m_len;
 906        }
 907
 908        return 0;
 909}
 910
 911
 912/* Submit data for all the fast commit inodes */
 913static int ext4_fc_submit_inode_data_all(journal_t *journal)
 914{
 915        struct super_block *sb = (struct super_block *)(journal->j_private);
 916        struct ext4_sb_info *sbi = EXT4_SB(sb);
 917        struct ext4_inode_info *ei;
 918        struct list_head *pos;
 919        int ret = 0;
 920
 921        spin_lock(&sbi->s_fc_lock);
 922        ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 923        list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
 924                ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
 925                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 926                while (atomic_read(&ei->i_fc_updates)) {
 927                        DEFINE_WAIT(wait);
 928
 929                        prepare_to_wait(&ei->i_fc_wait, &wait,
 930                                                TASK_UNINTERRUPTIBLE);
 931                        if (atomic_read(&ei->i_fc_updates)) {
 932                                spin_unlock(&sbi->s_fc_lock);
 933                                schedule();
 934                                spin_lock(&sbi->s_fc_lock);
 935                        }
 936                        finish_wait(&ei->i_fc_wait, &wait);
 937                }
 938                spin_unlock(&sbi->s_fc_lock);
 939                ret = jbd2_submit_inode_data(ei->jinode);
 940                if (ret)
 941                        return ret;
 942                spin_lock(&sbi->s_fc_lock);
 943        }
 944        spin_unlock(&sbi->s_fc_lock);
 945
 946        return ret;
 947}
 948
 949/* Wait for completion of data for all the fast commit inodes */
 950static int ext4_fc_wait_inode_data_all(journal_t *journal)
 951{
 952        struct super_block *sb = (struct super_block *)(journal->j_private);
 953        struct ext4_sb_info *sbi = EXT4_SB(sb);
 954        struct ext4_inode_info *pos, *n;
 955        int ret = 0;
 956
 957        spin_lock(&sbi->s_fc_lock);
 958        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 959                if (!ext4_test_inode_state(&pos->vfs_inode,
 960                                           EXT4_STATE_FC_COMMITTING))
 961                        continue;
 962                spin_unlock(&sbi->s_fc_lock);
 963
 964                ret = jbd2_wait_inode_data(journal, pos->jinode);
 965                if (ret)
 966                        return ret;
 967                spin_lock(&sbi->s_fc_lock);
 968        }
 969        spin_unlock(&sbi->s_fc_lock);
 970
 971        return 0;
 972}
 973
 974/* Commit all the directory entry updates */
 975static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 976__acquires(&sbi->s_fc_lock)
 977__releases(&sbi->s_fc_lock)
 978{
 979        struct super_block *sb = (struct super_block *)(journal->j_private);
 980        struct ext4_sb_info *sbi = EXT4_SB(sb);
 981        struct ext4_fc_dentry_update *fc_dentry;
 982        struct inode *inode;
 983        struct list_head *pos, *n, *fcd_pos, *fcd_n;
 984        struct ext4_inode_info *ei;
 985        int ret;
 986
 987        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 988                return 0;
 989        list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
 990                fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
 991                                        fcd_list);
 992                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
 993                        spin_unlock(&sbi->s_fc_lock);
 994                        if (!ext4_fc_add_dentry_tlv(
 995                                sb, fc_dentry->fcd_op,
 996                                fc_dentry->fcd_parent, fc_dentry->fcd_ino,
 997                                fc_dentry->fcd_name.len,
 998                                fc_dentry->fcd_name.name, crc)) {
 999                                ret = -ENOSPC;
1000                                goto lock_and_exit;
1001                        }
1002                        spin_lock(&sbi->s_fc_lock);
1003                        continue;
1004                }
1005
1006                inode = NULL;
1007                list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1008                        ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
1009                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1010                                inode = &ei->vfs_inode;
1011                                break;
1012                        }
1013                }
1014                /*
1015                 * If we don't find inode in our list, then it was deleted,
1016                 * in which case, we don't need to record it's create tag.
1017                 */
1018                if (!inode)
1019                        continue;
1020                spin_unlock(&sbi->s_fc_lock);
1021
1022                /*
1023                 * We first write the inode and then the create dirent. This
1024                 * allows the recovery code to create an unnamed inode first
1025                 * and then link it to a directory entry. This allows us
1026                 * to use namei.c routines almost as is and simplifies
1027                 * the recovery code.
1028                 */
1029                ret = ext4_fc_write_inode(inode, crc);
1030                if (ret)
1031                        goto lock_and_exit;
1032
1033                ret = ext4_fc_write_inode_data(inode, crc);
1034                if (ret)
1035                        goto lock_and_exit;
1036
1037                if (!ext4_fc_add_dentry_tlv(
1038                        sb, fc_dentry->fcd_op,
1039                        fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1040                        fc_dentry->fcd_name.len,
1041                        fc_dentry->fcd_name.name, crc)) {
1042                        ret = -ENOSPC;
1043                        goto lock_and_exit;
1044                }
1045
1046                spin_lock(&sbi->s_fc_lock);
1047        }
1048        return 0;
1049lock_and_exit:
1050        spin_lock(&sbi->s_fc_lock);
1051        return ret;
1052}
1053
1054static int ext4_fc_perform_commit(journal_t *journal)
1055{
1056        struct super_block *sb = (struct super_block *)(journal->j_private);
1057        struct ext4_sb_info *sbi = EXT4_SB(sb);
1058        struct ext4_inode_info *iter;
1059        struct ext4_fc_head head;
1060        struct list_head *pos;
1061        struct inode *inode;
1062        struct blk_plug plug;
1063        int ret = 0;
1064        u32 crc = 0;
1065
1066        ret = ext4_fc_submit_inode_data_all(journal);
1067        if (ret)
1068                return ret;
1069
1070        ret = ext4_fc_wait_inode_data_all(journal);
1071        if (ret)
1072                return ret;
1073
1074        /*
1075         * If file system device is different from journal device, issue a cache
1076         * flush before we start writing fast commit blocks.
1077         */
1078        if (journal->j_fs_dev != journal->j_dev)
1079                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1080
1081        blk_start_plug(&plug);
1082        if (sbi->s_fc_bytes == 0) {
1083                /*
1084                 * Add a head tag only if this is the first fast commit
1085                 * in this TID.
1086                 */
1087                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088                head.fc_tid = cpu_to_le32(
1089                        sbi->s_journal->j_running_transaction->t_tid);
1090                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091                        (u8 *)&head, &crc))
1092                        goto out;
1093        }
1094
1095        spin_lock(&sbi->s_fc_lock);
1096        ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097        if (ret) {
1098                spin_unlock(&sbi->s_fc_lock);
1099                goto out;
1100        }
1101
1102        list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1103                iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1104                inode = &iter->vfs_inode;
1105                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106                        continue;
1107
1108                spin_unlock(&sbi->s_fc_lock);
1109                ret = ext4_fc_write_inode_data(inode, &crc);
1110                if (ret)
1111                        goto out;
1112                ret = ext4_fc_write_inode(inode, &crc);
1113                if (ret)
1114                        goto out;
1115                spin_lock(&sbi->s_fc_lock);
1116        }
1117        spin_unlock(&sbi->s_fc_lock);
1118
1119        ret = ext4_fc_write_tail(sb, crc);
1120
1121out:
1122        blk_finish_plug(&plug);
1123        return ret;
1124}
1125
1126/*
1127 * The main commit entry point. Performs a fast commit for transaction
1128 * commit_tid if needed. If it's not possible to perform a fast commit
1129 * due to various reasons, we fall back to full commit. Returns 0
1130 * on success, error otherwise.
1131 */
1132int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133{
1134        struct super_block *sb = (struct super_block *)(journal->j_private);
1135        struct ext4_sb_info *sbi = EXT4_SB(sb);
1136        int nblks = 0, ret, bsize = journal->j_blocksize;
1137        int subtid = atomic_read(&sbi->s_fc_subtid);
1138        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139        ktime_t start_time, commit_time;
1140
1141        trace_ext4_fc_commit_start(sb);
1142
1143        start_time = ktime_get();
1144
1145        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146                (ext4_fc_is_ineligible(sb))) {
1147                reason = EXT4_FC_REASON_INELIGIBLE;
1148                goto out;
1149        }
1150
1151restart_fc:
1152        ret = jbd2_fc_begin_commit(journal, commit_tid);
1153        if (ret == -EALREADY) {
1154                /* There was an ongoing commit, check if we need to restart */
1155                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156                        commit_tid > journal->j_commit_sequence)
1157                        goto restart_fc;
1158                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159                goto out;
1160        } else if (ret) {
1161                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162                reason = EXT4_FC_REASON_FC_START_FAILED;
1163                goto out;
1164        }
1165
1166        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167        ret = ext4_fc_perform_commit(journal);
1168        if (ret < 0) {
1169                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170                reason = EXT4_FC_REASON_FC_FAILED;
1171                goto out;
1172        }
1173        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174        ret = jbd2_fc_wait_bufs(journal, nblks);
1175        if (ret < 0) {
1176                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177                reason = EXT4_FC_REASON_FC_FAILED;
1178                goto out;
1179        }
1180        atomic_inc(&sbi->s_fc_subtid);
1181        jbd2_fc_end_commit(journal);
1182out:
1183        /* Has any ineligible update happened since we started? */
1184        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186                reason = EXT4_FC_REASON_INELIGIBLE;
1187        }
1188
1189        spin_lock(&sbi->s_fc_lock);
1190        if (reason != EXT4_FC_REASON_OK &&
1191                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192                sbi->s_fc_stats.fc_ineligible_commits++;
1193        } else {
1194                sbi->s_fc_stats.fc_num_commits++;
1195                sbi->s_fc_stats.fc_numblks += nblks;
1196        }
1197        spin_unlock(&sbi->s_fc_lock);
1198        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199        trace_ext4_fc_commit_stop(sb, nblks, reason);
1200        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201        /*
1202         * weight the commit time higher than the average time so we don't
1203         * react too strongly to vast changes in the commit time
1204         */
1205        if (likely(sbi->s_fc_avg_commit_time))
1206                sbi->s_fc_avg_commit_time = (commit_time +
1207                                sbi->s_fc_avg_commit_time * 3) / 4;
1208        else
1209                sbi->s_fc_avg_commit_time = commit_time;
1210        jbd_debug(1,
1211                "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212                nblks, reason, subtid);
1213        if (reason == EXT4_FC_REASON_FC_FAILED)
1214                return jbd2_fc_end_commit_fallback(journal);
1215        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216                reason == EXT4_FC_REASON_INELIGIBLE)
1217                return jbd2_complete_transaction(journal, commit_tid);
1218        return 0;
1219}
1220
1221/*
1222 * Fast commit cleanup routine. This is called after every fast commit and
1223 * full commit. full is true if we are called after a full commit.
1224 */
1225static void ext4_fc_cleanup(journal_t *journal, int full)
1226{
1227        struct super_block *sb = journal->j_private;
1228        struct ext4_sb_info *sbi = EXT4_SB(sb);
1229        struct ext4_inode_info *iter;
1230        struct ext4_fc_dentry_update *fc_dentry;
1231        struct list_head *pos, *n;
1232
1233        if (full && sbi->s_fc_bh)
1234                sbi->s_fc_bh = NULL;
1235
1236        jbd2_fc_release_bufs(journal);
1237
1238        spin_lock(&sbi->s_fc_lock);
1239        list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1240                iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1241                list_del_init(&iter->i_fc_list);
1242                ext4_clear_inode_state(&iter->vfs_inode,
1243                                       EXT4_STATE_FC_COMMITTING);
1244                ext4_fc_reset_inode(&iter->vfs_inode);
1245                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246                smp_mb();
1247#if (BITS_PER_LONG < 64)
1248                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249#else
1250                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251#endif
1252        }
1253
1254        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256                                             struct ext4_fc_dentry_update,
1257                                             fcd_list);
1258                list_del_init(&fc_dentry->fcd_list);
1259                spin_unlock(&sbi->s_fc_lock);
1260
1261                if (fc_dentry->fcd_name.name &&
1262                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263                        kfree(fc_dentry->fcd_name.name);
1264                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265                spin_lock(&sbi->s_fc_lock);
1266        }
1267
1268        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271                                &sbi->s_fc_q[FC_Q_MAIN]);
1272
1273        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275
1276        if (full)
1277                sbi->s_fc_bytes = 0;
1278        spin_unlock(&sbi->s_fc_lock);
1279        trace_ext4_fc_stats(sb);
1280}
1281
1282/* Ext4 Replay Path Routines */
1283
/*
 * Helper struct for dentry replay routines: the decoded contents of a
 * dentry TLV as filled in by tl_to_darg().
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not set by tl_to_darg() */
	char *dname;		/* entry name, points into the TLV value */
};
1289
1290static inline void tl_to_darg(struct dentry_info_args *darg,
1291                                struct  ext4_fc_tl *tl)
1292{
1293        struct ext4_fc_dentry_info *fcd;
1294
1295        fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1296
1297        darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1298        darg->ino = le32_to_cpu(fcd->fc_ino);
1299        darg->dname = fcd->fc_dname;
1300        darg->dname_len = ext4_fc_tag_len(tl) -
1301                        sizeof(struct ext4_fc_dentry_info);
1302}
1303
1304/* Unlink replay function */
1305static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1306{
1307        struct inode *inode, *old_parent;
1308        struct qstr entry;
1309        struct dentry_info_args darg;
1310        int ret = 0;
1311
1312        tl_to_darg(&darg, tl);
1313
1314        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315                        darg.parent_ino, darg.dname_len);
1316
1317        entry.name = darg.dname;
1318        entry.len = darg.dname_len;
1319        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320
1321        if (IS_ERR(inode)) {
1322                jbd_debug(1, "Inode %d not found", darg.ino);
1323                return 0;
1324        }
1325
1326        old_parent = ext4_iget(sb, darg.parent_ino,
1327                                EXT4_IGET_NORMAL);
1328        if (IS_ERR(old_parent)) {
1329                jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1330                iput(inode);
1331                return 0;
1332        }
1333
1334        ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335        /* -ENOENT ok coz it might not exist anymore. */
1336        if (ret == -ENOENT)
1337                ret = 0;
1338        iput(old_parent);
1339        iput(inode);
1340        return ret;
1341}
1342
1343static int ext4_fc_replay_link_internal(struct super_block *sb,
1344                                struct dentry_info_args *darg,
1345                                struct inode *inode)
1346{
1347        struct inode *dir = NULL;
1348        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350        int ret = 0;
1351
1352        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353        if (IS_ERR(dir)) {
1354                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355                dir = NULL;
1356                goto out;
1357        }
1358
1359        dentry_dir = d_obtain_alias(dir);
1360        if (IS_ERR(dentry_dir)) {
1361                jbd_debug(1, "Failed to obtain dentry");
1362                dentry_dir = NULL;
1363                goto out;
1364        }
1365
1366        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367        if (!dentry_inode) {
1368                jbd_debug(1, "Inode dentry not created.");
1369                ret = -ENOMEM;
1370                goto out;
1371        }
1372
1373        ret = __ext4_link(dir, inode, dentry_inode);
1374        /*
1375         * It's possible that link already existed since data blocks
1376         * for the dir in question got persisted before we crashed OR
1377         * we replayed this tag and crashed before the entire replay
1378         * could complete.
1379         */
1380        if (ret && ret != -EEXIST) {
1381                jbd_debug(1, "Failed to link\n");
1382                goto out;
1383        }
1384
1385        ret = 0;
1386out:
1387        if (dentry_dir) {
1388                d_drop(dentry_dir);
1389                dput(dentry_dir);
1390        } else if (dir) {
1391                iput(dir);
1392        }
1393        if (dentry_inode) {
1394                d_drop(dentry_inode);
1395                dput(dentry_inode);
1396        }
1397
1398        return ret;
1399}
1400
1401/* Link replay function */
1402static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1403{
1404        struct inode *inode;
1405        struct dentry_info_args darg;
1406        int ret = 0;
1407
1408        tl_to_darg(&darg, tl);
1409        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1410                        darg.parent_ino, darg.dname_len);
1411
1412        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1413        if (IS_ERR(inode)) {
1414                jbd_debug(1, "Inode not found.");
1415                return 0;
1416        }
1417
1418        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1419        iput(inode);
1420        return ret;
1421}
1422
1423/*
1424 * Record all the modified inodes during replay. We use this later to setup
1425 * block bitmaps correctly.
1426 */
1427static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1428{
1429        struct ext4_fc_replay_state *state;
1430        int i;
1431
1432        state = &EXT4_SB(sb)->s_fc_replay_state;
1433        for (i = 0; i < state->fc_modified_inodes_used; i++)
1434                if (state->fc_modified_inodes[i] == ino)
1435                        return 0;
1436        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1437                state->fc_modified_inodes_size +=
1438                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
1439                state->fc_modified_inodes = krealloc(
1440                                        state->fc_modified_inodes, sizeof(int) *
1441                                        state->fc_modified_inodes_size,
1442                                        GFP_KERNEL);
1443                if (!state->fc_modified_inodes)
1444                        return -ENOMEM;
1445        }
1446        state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1447        return 0;
1448}
1449
1450/*
1451 * Inode replay function
1452 */
1453static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1454{
1455        struct ext4_fc_inode *fc_inode;
1456        struct ext4_inode *raw_inode;
1457        struct ext4_inode *raw_fc_inode;
1458        struct inode *inode = NULL;
1459        struct ext4_iloc iloc;
1460        int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1461        struct ext4_extent_header *eh;
1462
1463        fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1464
1465        ino = le32_to_cpu(fc_inode->fc_ino);
1466        trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1467
1468        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1469        if (!IS_ERR(inode)) {
1470                ext4_ext_clear_bb(inode);
1471                iput(inode);
1472        }
1473        inode = NULL;
1474
1475        ext4_fc_record_modified_inode(sb, ino);
1476
1477        raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1478        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1479        if (ret)
1480                goto out;
1481
1482        inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1483        raw_inode = ext4_raw_inode(&iloc);
1484
1485        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1486        memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1487                inode_len - offsetof(struct ext4_inode, i_generation));
1488        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1489                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1490                if (eh->eh_magic != EXT4_EXT_MAGIC) {
1491                        memset(eh, 0, sizeof(*eh));
1492                        eh->eh_magic = EXT4_EXT_MAGIC;
1493                        eh->eh_max = cpu_to_le16(
1494                                (sizeof(raw_inode->i_block) -
1495                                 sizeof(struct ext4_extent_header))
1496                                 / sizeof(struct ext4_extent));
1497                }
1498        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1499                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1500                        sizeof(raw_inode->i_block));
1501        }
1502
1503        /* Immediately update the inode on disk. */
1504        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1505        if (ret)
1506                goto out;
1507        ret = sync_dirty_buffer(iloc.bh);
1508        if (ret)
1509                goto out;
1510        ret = ext4_mark_inode_used(sb, ino);
1511        if (ret)
1512                goto out;
1513
1514        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1515        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1516        if (IS_ERR(inode)) {
1517                jbd_debug(1, "Inode not found.");
1518                return -EFSCORRUPTED;
1519        }
1520
1521        /*
1522         * Our allocator could have made different decisions than before
1523         * crashing. This should be fixed but until then, we calculate
1524         * the number of blocks the inode.
1525         */
1526        ext4_ext_replay_set_iblocks(inode);
1527
1528        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1529        ext4_reset_inode_seed(inode);
1530
1531        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1532        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1533        sync_dirty_buffer(iloc.bh);
1534        brelse(iloc.bh);
1535out:
1536        iput(inode);
1537        if (!ret)
1538                blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1539
1540        return 0;
1541}
1542
1543/*
1544 * Dentry create replay function.
1545 *
1546 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1547 * inode for which we are trying to create a dentry here, should already have
1548 * been replayed before we start here.
1549 */
1550static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1551{
1552        int ret = 0;
1553        struct inode *inode = NULL;
1554        struct inode *dir = NULL;
1555        struct dentry_info_args darg;
1556
1557        tl_to_darg(&darg, tl);
1558
1559        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1560                        darg.parent_ino, darg.dname_len);
1561
1562        /* This takes care of update group descriptor and other metadata */
1563        ret = ext4_mark_inode_used(sb, darg.ino);
1564        if (ret)
1565                goto out;
1566
1567        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1568        if (IS_ERR(inode)) {
1569                jbd_debug(1, "inode %d not found.", darg.ino);
1570                inode = NULL;
1571                ret = -EINVAL;
1572                goto out;
1573        }
1574
1575        if (S_ISDIR(inode->i_mode)) {
1576                /*
1577                 * If we are creating a directory, we need to make sure that the
1578                 * dot and dot dot dirents are setup properly.
1579                 */
1580                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1581                if (IS_ERR(dir)) {
1582                        jbd_debug(1, "Dir %d not found.", darg.ino);
1583                        goto out;
1584                }
1585                ret = ext4_init_new_dir(NULL, dir, inode);
1586                iput(dir);
1587                if (ret) {
1588                        ret = 0;
1589                        goto out;
1590                }
1591        }
1592        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1593        if (ret)
1594                goto out;
1595        set_nlink(inode, 1);
1596        ext4_mark_inode_dirty(NULL, inode);
1597out:
1598        if (inode)
1599                iput(inode);
1600        return ret;
1601}
1602
1603/*
1604 * Record physical disk regions which are in use as per fast commit area. Our
1605 * simple replay phase allocator excludes these regions from allocation.
1606 */
1607static int ext4_fc_record_regions(struct super_block *sb, int ino,
1608                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1609{
1610        struct ext4_fc_replay_state *state;
1611        struct ext4_fc_alloc_region *region;
1612
1613        state = &EXT4_SB(sb)->s_fc_replay_state;
1614        if (state->fc_regions_used == state->fc_regions_size) {
1615                state->fc_regions_size +=
1616                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
1617                state->fc_regions = krealloc(
1618                                        state->fc_regions,
1619                                        state->fc_regions_size *
1620                                        sizeof(struct ext4_fc_alloc_region),
1621                                        GFP_KERNEL);
1622                if (!state->fc_regions)
1623                        return -ENOMEM;
1624        }
1625        region = &state->fc_regions[state->fc_regions_used++];
1626        region->ino = ino;
1627        region->lblk = lblk;
1628        region->pblk = pblk;
1629        region->len = len;
1630
1631        return 0;
1632}
1633
1634/* Replay add range tag */
1635static int ext4_fc_replay_add_range(struct super_block *sb,
1636                                struct ext4_fc_tl *tl)
1637{
1638        struct ext4_fc_add_range *fc_add_ex;
1639        struct ext4_extent newex, *ex;
1640        struct inode *inode;
1641        ext4_lblk_t start, cur;
1642        int remaining, len;
1643        ext4_fsblk_t start_pblk;
1644        struct ext4_map_blocks map;
1645        struct ext4_ext_path *path = NULL;
1646        int ret;
1647
1648        fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1649        ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1650
1651        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1652                le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1653                ext4_ext_get_actual_len(ex));
1654
1655        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1656                                EXT4_IGET_NORMAL);
1657        if (IS_ERR(inode)) {
1658                jbd_debug(1, "Inode not found.");
1659                return 0;
1660        }
1661
1662        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1663
1664        start = le32_to_cpu(ex->ee_block);
1665        start_pblk = ext4_ext_pblock(ex);
1666        len = ext4_ext_get_actual_len(ex);
1667
1668        cur = start;
1669        remaining = len;
1670        jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1671                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1672                  inode->i_ino);
1673
1674        while (remaining > 0) {
1675                map.m_lblk = cur;
1676                map.m_len = remaining;
1677                map.m_pblk = 0;
1678                ret = ext4_map_blocks(NULL, inode, &map, 0);
1679
1680                if (ret < 0) {
1681                        iput(inode);
1682                        return 0;
1683                }
1684
1685                if (ret == 0) {
1686                        /* Range is not mapped */
1687                        path = ext4_find_extent(inode, cur, NULL, 0);
1688                        if (IS_ERR(path)) {
1689                                iput(inode);
1690                                return 0;
1691                        }
1692                        memset(&newex, 0, sizeof(newex));
1693                        newex.ee_block = cpu_to_le32(cur);
1694                        ext4_ext_store_pblock(
1695                                &newex, start_pblk + cur - start);
1696                        newex.ee_len = cpu_to_le16(map.m_len);
1697                        if (ext4_ext_is_unwritten(ex))
1698                                ext4_ext_mark_unwritten(&newex);
1699                        down_write(&EXT4_I(inode)->i_data_sem);
1700                        ret = ext4_ext_insert_extent(
1701                                NULL, inode, &path, &newex, 0);
1702                        up_write((&EXT4_I(inode)->i_data_sem));
1703                        ext4_ext_drop_refs(path);
1704                        kfree(path);
1705                        if (ret) {
1706                                iput(inode);
1707                                return 0;
1708                        }
1709                        goto next;
1710                }
1711
1712                if (start_pblk + cur - start != map.m_pblk) {
1713                        /*
1714                         * Logical to physical mapping changed. This can happen
1715                         * if this range was removed and then reallocated to
1716                         * map to new physical blocks during a fast commit.
1717                         */
1718                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1719                                        ext4_ext_is_unwritten(ex),
1720                                        start_pblk + cur - start);
1721                        if (ret) {
1722                                iput(inode);
1723                                return 0;
1724                        }
1725                        /*
1726                         * Mark the old blocks as free since they aren't used
1727                         * anymore. We maintain an array of all the modified
1728                         * inodes. In case these blocks are still used at either
1729                         * a different logical range in the same inode or in
1730                         * some different inode, we will mark them as allocated
1731                         * at the end of the FC replay using our array of
1732                         * modified inodes.
1733                         */
1734                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1735                        goto next;
1736                }
1737
1738                /* Range is mapped and needs a state change */
1739                jbd_debug(1, "Converting from %d to %d %lld",
1740                                map.m_flags & EXT4_MAP_UNWRITTEN,
1741                        ext4_ext_is_unwritten(ex), map.m_pblk);
1742                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1743                                        ext4_ext_is_unwritten(ex), map.m_pblk);
1744                if (ret) {
1745                        iput(inode);
1746                        return 0;
1747                }
1748                /*
1749                 * We may have split the extent tree while toggling the state.
1750                 * Try to shrink the extent tree now.
1751                 */
1752                ext4_ext_replay_shrink_inode(inode, start + len);
1753next:
1754                cur += map.m_len;
1755                remaining -= map.m_len;
1756        }
1757        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1758                                        sb->s_blocksize_bits);
1759        iput(inode);
1760        return 0;
1761}
1762
1763/* Replay DEL_RANGE tag */
1764static int
1765ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1766{
1767        struct inode *inode;
1768        struct ext4_fc_del_range *lrange;
1769        struct ext4_map_blocks map;
1770        ext4_lblk_t cur, remaining;
1771        int ret;
1772
1773        lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1774        cur = le32_to_cpu(lrange->fc_lblk);
1775        remaining = le32_to_cpu(lrange->fc_len);
1776
1777        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1778                le32_to_cpu(lrange->fc_ino), cur, remaining);
1779
1780        inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1781        if (IS_ERR(inode)) {
1782                jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1783                return 0;
1784        }
1785
1786        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1787
1788        jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1789                        inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1790                        le32_to_cpu(lrange->fc_len));
1791        while (remaining > 0) {
1792                map.m_lblk = cur;
1793                map.m_len = remaining;
1794
1795                ret = ext4_map_blocks(NULL, inode, &map, 0);
1796                if (ret < 0) {
1797                        iput(inode);
1798                        return 0;
1799                }
1800                if (ret > 0) {
1801                        remaining -= ret;
1802                        cur += ret;
1803                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1804                } else {
1805                        remaining -= map.m_len;
1806                        cur += map.m_len;
1807                }
1808        }
1809
1810        ret = ext4_punch_hole(inode,
1811                le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1812                le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1813        if (ret)
1814                jbd_debug(1, "ext4_punch_hole returned %d", ret);
1815        ext4_ext_replay_shrink_inode(inode,
1816                i_size_read(inode) >> sb->s_blocksize_bits);
1817        ext4_mark_inode_dirty(NULL, inode);
1818        iput(inode);
1819
1820        return 0;
1821}
1822
1823static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1824{
1825        struct ext4_fc_replay_state *state;
1826        struct inode *inode;
1827        struct ext4_ext_path *path = NULL;
1828        struct ext4_map_blocks map;
1829        int i, ret, j;
1830        ext4_lblk_t cur, end;
1831
1832        state = &EXT4_SB(sb)->s_fc_replay_state;
1833        for (i = 0; i < state->fc_modified_inodes_used; i++) {
1834                inode = ext4_iget(sb, state->fc_modified_inodes[i],
1835                        EXT4_IGET_NORMAL);
1836                if (IS_ERR(inode)) {
1837                        jbd_debug(1, "Inode %d not found.",
1838                                state->fc_modified_inodes[i]);
1839                        continue;
1840                }
1841                cur = 0;
1842                end = EXT_MAX_BLOCKS;
1843                while (cur < end) {
1844                        map.m_lblk = cur;
1845                        map.m_len = end - cur;
1846
1847                        ret = ext4_map_blocks(NULL, inode, &map, 0);
1848                        if (ret < 0)
1849                                break;
1850
1851                        if (ret > 0) {
1852                                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1853                                if (!IS_ERR(path)) {
1854                                        for (j = 0; j < path->p_depth; j++)
1855                                                ext4_mb_mark_bb(inode->i_sb,
1856                                                        path[j].p_block, 1, 1);
1857                                        ext4_ext_drop_refs(path);
1858                                        kfree(path);
1859                                }
1860                                cur += ret;
1861                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1862                                                        map.m_len, 1);
1863                        } else {
1864                                cur = cur + (map.m_len ? map.m_len : 1);
1865                        }
1866                }
1867                iput(inode);
1868        }
1869}
1870
1871/*
1872 * Check if block is in excluded regions for block allocation. The simple
1873 * allocator that runs during replay phase is calls this function to see
1874 * if it is okay to use a block.
1875 */
1876bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1877{
1878        int i;
1879        struct ext4_fc_replay_state *state;
1880
1881        state = &EXT4_SB(sb)->s_fc_replay_state;
1882        for (i = 0; i < state->fc_regions_valid; i++) {
1883                if (state->fc_regions[i].ino == 0 ||
1884                        state->fc_regions[i].len == 0)
1885                        continue;
1886                if (blk >= state->fc_regions[i].pblk &&
1887                    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1888                        return true;
1889        }
1890        return false;
1891}
1892
1893/* Cleanup function called after replay */
1894void ext4_fc_replay_cleanup(struct super_block *sb)
1895{
1896        struct ext4_sb_info *sbi = EXT4_SB(sb);
1897
1898        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1899        kfree(sbi->s_fc_replay_state.fc_regions);
1900        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1901}
1902
1903/*
1904 * Recovery Scan phase handler
1905 *
1906 * This function is called during the scan phase and is responsible
1907 * for doing following things:
1908 * - Make sure the fast commit area has valid tags for replay
1909 * - Count number of tags that need to be replayed by the replay handler
1910 * - Verify CRC
1911 * - Create a list of excluded blocks for allocation during replay phase
1912 *
1913 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1914 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1915 * to indicate that scan has finished and JBD2 can now start replay phase.
1916 * It returns a negative error to indicate that there was an error. At the end
1917 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1918 * to indicate the number of tags that need to replayed during the replay phase.
1919 */
1920static int ext4_fc_replay_scan(journal_t *journal,
1921                                struct buffer_head *bh, int off,
1922                                tid_t expected_tid)
1923{
1924        struct super_block *sb = journal->j_private;
1925        struct ext4_sb_info *sbi = EXT4_SB(sb);
1926        struct ext4_fc_replay_state *state;
1927        int ret = JBD2_FC_REPLAY_CONTINUE;
1928        struct ext4_fc_add_range *ext;
1929        struct ext4_fc_tl *tl;
1930        struct ext4_fc_tail *tail;
1931        __u8 *start, *end;
1932        struct ext4_fc_head *head;
1933        struct ext4_extent *ex;
1934
1935        state = &sbi->s_fc_replay_state;
1936
1937        start = (u8 *)bh->b_data;
1938        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1939
1940        if (state->fc_replay_expected_off == 0) {
1941                state->fc_cur_tag = 0;
1942                state->fc_replay_num_tags = 0;
1943                state->fc_crc = 0;
1944                state->fc_regions = NULL;
1945                state->fc_regions_valid = state->fc_regions_used =
1946                        state->fc_regions_size = 0;
1947                /* Check if we can stop early */
1948                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1949                        != EXT4_FC_TAG_HEAD)
1950                        return 0;
1951        }
1952
1953        if (off != state->fc_replay_expected_off) {
1954                ret = -EFSCORRUPTED;
1955                goto out_err;
1956        }
1957
1958        state->fc_replay_expected_off++;
1959        fc_for_each_tl(start, end, tl) {
1960                jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1961                          tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1962                switch (le16_to_cpu(tl->fc_tag)) {
1963                case EXT4_FC_TAG_ADD_RANGE:
1964                        ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1965                        ex = (struct ext4_extent *)&ext->fc_ex;
1966                        ret = ext4_fc_record_regions(sb,
1967                                le32_to_cpu(ext->fc_ino),
1968                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1969                                ext4_ext_get_actual_len(ex));
1970                        if (ret < 0)
1971                                break;
1972                        ret = JBD2_FC_REPLAY_CONTINUE;
1973                        fallthrough;
1974                case EXT4_FC_TAG_DEL_RANGE:
1975                case EXT4_FC_TAG_LINK:
1976                case EXT4_FC_TAG_UNLINK:
1977                case EXT4_FC_TAG_CREAT:
1978                case EXT4_FC_TAG_INODE:
1979                case EXT4_FC_TAG_PAD:
1980                        state->fc_cur_tag++;
1981                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1982                                        sizeof(*tl) + ext4_fc_tag_len(tl));
1983                        break;
1984                case EXT4_FC_TAG_TAIL:
1985                        state->fc_cur_tag++;
1986                        tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1987                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1988                                                sizeof(*tl) +
1989                                                offsetof(struct ext4_fc_tail,
1990                                                fc_crc));
1991                        if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1992                                le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1993                                state->fc_replay_num_tags = state->fc_cur_tag;
1994                                state->fc_regions_valid =
1995                                        state->fc_regions_used;
1996                        } else {
1997                                ret = state->fc_replay_num_tags ?
1998                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1999                        }
2000                        state->fc_crc = 0;
2001                        break;
2002                case EXT4_FC_TAG_HEAD:
2003                        head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2004                        if (le32_to_cpu(head->fc_features) &
2005                                ~EXT4_FC_SUPPORTED_FEATURES) {
2006                                ret = -EOPNOTSUPP;
2007                                break;
2008                        }
2009                        if (le32_to_cpu(head->fc_tid) != expected_tid) {
2010                                ret = JBD2_FC_REPLAY_STOP;
2011                                break;
2012                        }
2013                        state->fc_cur_tag++;
2014                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2015                                        sizeof(*tl) + ext4_fc_tag_len(tl));
2016                        break;
2017                default:
2018                        ret = state->fc_replay_num_tags ?
2019                                JBD2_FC_REPLAY_STOP : -ECANCELED;
2020                }
2021                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2022                        break;
2023        }
2024
2025out_err:
2026        trace_ext4_fc_replay_scan(sb, ret, off);
2027        return ret;
2028}
2029
2030/*
2031 * Main recovery path entry point.
2032 * The meaning of return codes is similar as above.
2033 */
2034static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2035                                enum passtype pass, int off, tid_t expected_tid)
2036{
2037        struct super_block *sb = journal->j_private;
2038        struct ext4_sb_info *sbi = EXT4_SB(sb);
2039        struct ext4_fc_tl *tl;
2040        __u8 *start, *end;
2041        int ret = JBD2_FC_REPLAY_CONTINUE;
2042        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2043        struct ext4_fc_tail *tail;
2044
2045        if (pass == PASS_SCAN) {
2046                state->fc_current_pass = PASS_SCAN;
2047                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2048        }
2049
2050        if (state->fc_current_pass != pass) {
2051                state->fc_current_pass = pass;
2052                sbi->s_mount_state |= EXT4_FC_REPLAY;
2053        }
2054        if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2055                jbd_debug(1, "Replay stops\n");
2056                ext4_fc_set_bitmaps_and_counters(sb);
2057                return 0;
2058        }
2059
2060#ifdef CONFIG_EXT4_DEBUG
2061        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2062                pr_warn("Dropping fc block %d because max_replay set\n", off);
2063                return JBD2_FC_REPLAY_STOP;
2064        }
2065#endif
2066
2067        start = (u8 *)bh->b_data;
2068        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2069
2070        fc_for_each_tl(start, end, tl) {
2071                if (state->fc_replay_num_tags == 0) {
2072                        ret = JBD2_FC_REPLAY_STOP;
2073                        ext4_fc_set_bitmaps_and_counters(sb);
2074                        break;
2075                }
2076                jbd_debug(3, "Replay phase, tag:%s\n",
2077                                tag2str(le16_to_cpu(tl->fc_tag)));
2078                state->fc_replay_num_tags--;
2079                switch (le16_to_cpu(tl->fc_tag)) {
2080                case EXT4_FC_TAG_LINK:
2081                        ret = ext4_fc_replay_link(sb, tl);
2082                        break;
2083                case EXT4_FC_TAG_UNLINK:
2084                        ret = ext4_fc_replay_unlink(sb, tl);
2085                        break;
2086                case EXT4_FC_TAG_ADD_RANGE:
2087                        ret = ext4_fc_replay_add_range(sb, tl);
2088                        break;
2089                case EXT4_FC_TAG_CREAT:
2090                        ret = ext4_fc_replay_create(sb, tl);
2091                        break;
2092                case EXT4_FC_TAG_DEL_RANGE:
2093                        ret = ext4_fc_replay_del_range(sb, tl);
2094                        break;
2095                case EXT4_FC_TAG_INODE:
2096                        ret = ext4_fc_replay_inode(sb, tl);
2097                        break;
2098                case EXT4_FC_TAG_PAD:
2099                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2100                                ext4_fc_tag_len(tl), 0);
2101                        break;
2102                case EXT4_FC_TAG_TAIL:
2103                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2104                                ext4_fc_tag_len(tl), 0);
2105                        tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2106                        WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2107                        break;
2108                case EXT4_FC_TAG_HEAD:
2109                        break;
2110                default:
2111                        trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2112                                ext4_fc_tag_len(tl), 0);
2113                        ret = -ECANCELED;
2114                        break;
2115                }
2116                if (ret < 0)
2117                        break;
2118                ret = JBD2_FC_REPLAY_CONTINUE;
2119        }
2120        return ret;
2121}
2122
2123void ext4_fc_init(struct super_block *sb, journal_t *journal)
2124{
2125        /*
2126         * We set replay callback even if fast commit disabled because we may
2127         * could still have fast commit blocks that need to be replayed even if
2128         * fast commit has now been turned off.
2129         */
2130        journal->j_fc_replay_callback = ext4_fc_replay;
2131        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2132                return;
2133        journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2134}
2135
/*
 * Human readable names for the reasons a fast commit was ineligible, printed
 * by ext4_fc_info_show(). The table is indexed by reason number, so the
 * order of the entries matters. Both the strings and the pointer array are
 * read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2148
2149int ext4_fc_info_show(struct seq_file *seq, void *v)
2150{
2151        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2152        struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2153        int i;
2154
2155        if (v != SEQ_START_TOKEN)
2156                return 0;
2157
2158        seq_printf(seq,
2159                "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2160                   stats->fc_num_commits, stats->fc_ineligible_commits,
2161                   stats->fc_numblks,
2162                   div_u64(sbi->s_fc_avg_commit_time, 1000));
2163        seq_puts(seq, "Ineligible reasons:\n");
2164        for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2165                seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2166                        stats->fc_ineligible_reason_count[i]);
2167
2168        return 0;
2169}
2170
2171int __init ext4_fc_init_dentry_cache(void)
2172{
2173        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2174                                           SLAB_RECLAIM_ACCOUNT);
2175
2176        if (ext4_fc_dentry_cachep == NULL)
2177                return -ENOMEM;
2178
2179        return 0;
2180}
2181