// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl; a sketch of its layout follows the category list below.)
 * Each TLV contains some delta that is replayed TLV by TLV during the
 * recovery phase. For the scenarios for which we currently don't have replay
 * code, fast commit falls back to full commits. Fast commits record deltas in
 * one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
 * - EXT4_FC_TAG_LINK           - records directory entry link
 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc.):
 *
 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
 *                                during recovery. Note that the iblocks field
 *                                is not replayed and instead derived during
 *                                replay.
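 *
 * For reference, every tag above shares the same on-disk tag-length header.
 * A minimal sketch of that header (assumed to match struct ext4_fc_tl as
 * declared in fast_commit.h):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *
 * The value bytes of length fc_len immediately follow this header.
 *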
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes that
 * need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
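 * As an illustration (a sketch of the expected calling pattern, not a
 * verbatim caller), an update site is bracketed like this:
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	// ... perform the inode update here ...
 *	ext4_fc_stop_update(inode);	// wakes up any waiting fast commit
 *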
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
 *
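 * A sketch of how the second pair is meant to be used (illustrative only;
 * the reason code is an example):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... the unsupported (here, xattr) update happens ...
 *	ext4_fc_stop_ineligible(sb);	// next commit will be a full commit
 *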
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
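 * For reference, a minimal sketch of the tail's on-disk value (assumed to
 * match struct ext4_fc_tail in fast_commit.h):
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID of the transaction this FC follows
 *		__le32 fc_crc;	// CRC of the fast commit contents
 *	};
 *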
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, the file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then attempt recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Make fast commit atomic updates more fine-grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at a much higher level
 *    than where the actual update happens. This can be made more fine-grained
 *    by combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>

static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit sub-system about the start of an inode update.
 *
 * This function is called by the VFS callbacks before performing any inode
 * update. This function blocks if there's an ongoing fast commit on the inode
 * in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it just needs to
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
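/*
 * Illustrative layout of the padding case described above (a sketch, not to
 * scale):
 *
 *	+---------------------+--------+------------------+
 *	| TLVs written so far | PAD tl | zeroed pad bytes |  <- submitted
 *	+---------------------+--------+------------------+
 *	The next reservation then starts at offset 0 of a fresh jbd2 block.
 */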
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there isn't enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the average commit time higher than the latest commit time
	 * so we don't react too strongly to sudden changes in the commit time.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}
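
/*
 * For reference, the on-disk value parsed by tl_to_darg() above (assumed
 * layout of struct ext4_fc_dentry_info in fast_commit.h):
 *
 *	struct ext4_fc_dentry_info {
 *		__le32 fc_parent_ino;	// parent directory inode number
 *		__le32 fc_ino;		// inode number the dentry points to
 *		u8 fc_dname[0];		// dentry name, fc_len - 8 bytes long
 *	};
 */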

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR because
	 * we replayed this tag and crashed before the entire replay could
	 * complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1398
1399/* Link replay function */
1400static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1401                               u8 *val)
1402{
1403        struct inode *inode;
1404        struct dentry_info_args darg;
1405        int ret = 0;
1406
1407        tl_to_darg(&darg, tl, val);
1408        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1409                        darg.parent_ino, darg.dname_len);
1410
1411        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1412        if (IS_ERR(inode)) {
1413                jbd_debug(1, "Inode not found.");
1414                return 0;
1415        }
1416
1417        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1418        iput(inode);
1419        return ret;
1420}
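
/*
 * Illustrative layout of the dentry tags consumed above (UNLINK, LINK and
 * CREAT all share it), as implied by tl_to_darg(): a struct ext4_fc_tl
 * header followed by a struct ext4_fc_dentry_info payload whose trailing
 * bytes are the name. This is a sketch of the on-disk format for the
 * reader, not a second definition of it:
 *
 *	+--------+--------+---------------+--------+------------------+
 *	| fc_tag | fc_len | fc_parent_ino | fc_ino | dname bytes ...  |
 *	+--------+--------+---------------+--------+------------------+
 *	 <- struct ext4_fc_tl -> <----- fc_len bytes of payload ----->
 *
 * so dname_len = fc_len - sizeof(struct ext4_fc_dentry_info).
 */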

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		/*
		 * Grow through a temporary so the old array isn't leaked
		 * if krealloc() fails.
		 */
		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
					EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks the inode occupies.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}
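
/*
 * Worked example for the eh_max computation above, assuming the standard
 * on-disk sizes: i_block is EXT4_N_BLOCKS (15) * 4 = 60 bytes, struct
 * ext4_extent_header is 12 bytes and struct ext4_extent is 12 bytes, so a
 * freshly reset inode-body extent tree holds (60 - 12) / 12 = 4 extents.
 */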

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we get here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * so that our simple replay-phase allocator can exclude these regions from
 * allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		/*
		 * Grow through a temporary so the old array isn't leaked
		 * if krealloc() fails.
		 */
		fc_regions = krealloc(state->fc_regions,
				sizeof(struct ext4_fc_alloc_region) *
				(state->fc_regions_size +
					EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions = fc_regions;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}
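
/*
 * Illustrative consumer of the regions recorded above: a replay-phase
 * allocator skips any block for which ext4_fc_replay_check_excluded()
 * (defined later in this file) returns true. This linear scan and its
 * "goal"/"max" parameters are a hypothetical sketch, not the actual
 * mballoc replay logic.
 */
static inline ext4_fsblk_t ext4_fc_sketch_find_free(struct super_block *sb,
						    ext4_fsblk_t goal,
						    ext4_fsblk_t max)
{
	ext4_fsblk_t blk;

	for (blk = goal; blk < max; blk++)
		if (!ext4_fc_replay_check_excluded(sb, blk))
			return blk;	/* first block outside excluded regions */
	return 0;			/* nothing usable found */
}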

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
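
/*
 * Worked example for the loop above (hypothetical numbers): suppose the
 * fast commit recorded lblk 100, pblk 5000, len 8, while the replayed tree
 * currently maps lblk 100..103 to pblk 6000..6003 and leaves 104..107
 * unmapped. The first iteration sees a mapping mismatch (5000 + 100 - 100
 * != 6000), rewrites lblk 100..103 to pblk 5000..5003 and frees
 * 6000..6003 in the bitmaps; the second iteration finds no mapping and
 * inserts a fresh extent covering lblk 104..107 at pblk 5004..5007.
 */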

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	/*
	 * Shift as loff_t so a large logical block number can't overflow
	 * the 32-bit type before the conversion to a byte offset.
	 */
	ret = ext4_punch_hole(inode,
		(loff_t)le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
		(loff_t)le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}
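
/*
 * Worked example for the ext4_punch_hole() call above: with 4k blocks
 * (s_blocksize_bits == 12), a DEL_RANGE of fc_lblk 10, fc_len 3 punches
 * 3 << 12 = 12288 bytes starting at byte offset 10 << 12 = 40960, i.e.
 * bytes 40960..53247, which is exactly blocks 10, 11 and 12.
 */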

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					    sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
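
/*
 * Minimal sketch of the TLV walk shared by ext4_fc_replay_scan() above and
 * ext4_fc_replay() below. The "handle" callback is hypothetical and only
 * marks where a tag consumer would plug in; the real functions switch on
 * fc_tag inline instead.
 */
static inline void ext4_fc_sketch_walk_tlvs(journal_t *journal,
					    struct buffer_head *bh,
					    void (*handle)(struct ext4_fc_tl *tl,
							   u8 *val))
{
	struct ext4_fc_tl tl;
	u8 *start = (u8 *)bh->b_data;
	u8 *end = start + journal->j_blocksize - 1;
	u8 *cur, *val;

	for (cur = start; cur < end;
	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		/* Copy the header out; it may be unaligned in the block. */
		memcpy(&tl, cur, sizeof(tl));
		/* fc_len bytes of payload follow the header. */
		val = cur + sizeof(tl);
		handle(&tl, val);
	}
}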

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
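
/*
 * JOURNAL_FAST_COMMIT above is typically derived from the on-disk
 * fast_commit feature flag, which is usually enabled at mkfs time
 * (e.g. "mkfs.ext4 -O fast_commit <dev>"); without it only
 * j_fc_replay_callback is registered, so stale fast commit blocks can
 * still be replayed after the feature is turned off.
 */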

static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}
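
/*
 * Illustrative output of the seq_printf() calls above, with made-up
 * numbers (this show function is typically wired up as the fc_info file
 * under /proc/fs/ext4/<dev>/):
 *
 *	fc stats:
 *	150 commits
 *	3 ineligible
 *	410 numblks
 *	1247us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	"Cross rename":	0
 *	...
 */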

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}