// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl, sketched after the list below). Each TLV contains some
 * delta that is replayed TLV by TLV during the recovery phase. For the
 * scenarios for which we currently don't have replay code, fast commit falls
 * back to full commits. Fast commits record deltas in one of the following
 * three categories:
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
 * - EXT4_FC_TAG_LINK           - records directory entry link
 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
 *                                during recovery. Note that the iblocks field
 *                                is not replayed but instead derived during
 *                                replay.
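 *
 * For reference, the TLV header that each of these records begins with is
 * simply a 16-bit tag followed by the 16-bit length of the value, both
 * little-endian. A sketch (the layout can be inferred from
 * ext4_fc_add_tlv() below; the struct itself lives in fast_commit.h):
 *
 *     struct ext4_fc_tl {
 *             __le16 fc_tag;
 *             __le16 fc_len;
 *     };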
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes that
 * need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes against any further data updates by setting the COMMITTING
 *     state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity; please read the
 *     "Atomicity of commits" section below for more details)
 * [7] Wait for [4], [5] and [6] to complete.
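 *
 * In this file the sequence above maps, roughly, onto the commit path as
 * follows (see ext4_fc_perform_commit() and ext4_fc_commit() below):
 *
 *     ext4_fc_submit_inode_data_all()  - steps [1] and [2]
 *     ext4_fc_wait_inode_data_all()    - step [3]
 *     ext4_fc_commit_dentry_updates()  - step [4]
 *     ext4_fc_write_inode_data() and
 *     ext4_fc_write_inode()            - step [5]
 *     ext4_fc_write_tail()             - step [6]
 *     jbd2_fc_wait_bufs()              - step [7]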
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is guaranteed to
 *   be followed by at least 1 full commit.
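 *
 *   As an illustration, a caller would bracket an unsupported operation
 *   like this (hypothetical call site; EXT4_FC_REASON_XATTR stands in for
 *   one of the ineligibility reason codes):
 *
 *       ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *       ... perform the fast-commit-ineligible update ...
 *       ext4_fc_stop_ineligible(sb);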
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead
 * of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as the procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then attempt recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher-level
 *    routines. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate) {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ei->i_fc_lblk_start = 0;
        ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
        INIT_LIST_HEAD(&ei->i_fc_list);
        init_waitqueue_head(&ei->i_fc_wait);
        atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on
 * the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        if (atomic_dec_and_test(&ei->i_fc_updates))
                wake_up_all(&ei->i_fc_wait);
}
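
/*
 * Illustrative pairing (hypothetical call site, not code in this file):
 * a VFS-facing update path is expected to bracket its work like this:
 *
 *     ext4_fc_start_update(inode);
 *     ... modify the inode ...
 *     ext4_fc_stop_update(inode);
 */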

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                return;
        }

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
        list_del_init(&ei->i_fc_list);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
        atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
        return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
                atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it just needs to
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        bool update = false;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        tid_t tid = 0;
        int ret;

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (sbi->s_mount_state & EXT4_FC_REPLAY))
                return -EOPNOTSUPP;

        if (ext4_fc_is_ineligible(inode->i_sb))
                return -EINVAL;

        tid = handle->h_transaction->t_tid;
        mutex_lock(&ei->i_fc_lock);
        if (tid == ei->i_sync_tid) {
                update = true;
        } else {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&EXT4_I(inode)->i_fc_list))
                list_add_tail(&EXT4_I(inode)->i_fc_list,
                                (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
                                &sbi->s_fc_q[FC_Q_STAGING] :
                                &sbi->s_fc_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

struct __track_dentry_update_args {
        struct dentry *dentry;
        int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct ext4_fc_dentry_update *node;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_dentry_update_args *dentry_update =
                (struct __track_dentry_update_args *)arg;
        struct dentry *dentry = dentry_update->dentry;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        mutex_unlock(&ei->i_fc_lock);
        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
                mutex_lock(&ei->i_fc_lock);
                return -ENOMEM;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dentry->d_parent->d_inode->i_ino;
        node->fcd_ino = inode->i_ino;
        if (dentry->d_name.len > DNAME_INLINE_LEN) {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(inode->i_sb,
                                EXT4_FC_REASON_NOMEM);
                        mutex_lock(&ei->i_fc_lock);
                        return -ENOMEM;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        } else {
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        }
        node->fcd_name.len = dentry->d_name.len;

        spin_lock(&sbi->s_fc_lock);
        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
                list_add_tail(&node->fcd_list,
                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
        else
                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);
        mutex_lock(&ei->i_fc_lock);

        return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_UNLINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_LINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_CREAT;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (update)
                return -EEXIST;

        EXT4_I(inode)->i_fc_lblk_len = 0;

        return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(inode->i_sb,
                                        EXT4_FC_REASON_INODE_JOURNAL_DATA);
                return;
        }

        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
        trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
        ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t oldstart;
        struct __track_range_args *__arg =
                (struct __track_range_args *)arg;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        oldstart = ei->i_fc_lblk_start;

        if (update && ei->i_fc_lblk_len > 0) {
                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
                ei->i_fc_lblk_len =
                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
                                ei->i_fc_lblk_start + 1;
        } else {
                ei->i_fc_lblk_start = __arg->start;
                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
        }

        return 0;
}
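
/*
 * Worked example for the merge above (illustrative numbers): if the tracked
 * range is start=10, len=5 (i.e. blocks 10..14) and a new update covers
 * blocks 8..20, the merged range becomes start = min(10, 8) = 8 and
 * len = max(10 + 5 - 1, 20) - 8 + 1 = 13, i.e. blocks 8..20.
 */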

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args;
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        args.start = start;
        args.end = end;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

        trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        int write_flags = REQ_SYNC;
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

        /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
        if (test_opt(sb, BARRIER) && is_tail)
                write_flags |= REQ_FUA | REQ_PREFLUSH;
        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE, write_flags, bh);
        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
                                u32 *crc)
{
        void *ret;

        ret = memset(dst, 0, len);
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
        return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log record across blocks. So
 * this function makes sure that if there's not enough space on the current
 * block, the remaining space in the current block is marked as unused by
 * adding an EXT4_FC_TAG_PAD tag. In that case, a new block is obtained
 * from jbd2 and the CRC is updated to reflect the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl *tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        int ret, off = sbi->s_fc_bytes % bsize;
        int pad_len;

        /*
         * After allocating len, we should have space at least for a 0 byte
         * padding.
         */
        if (len + sizeof(struct ext4_fc_tl) > bsize)
                return NULL;

        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
                /*
                 * Only allocate from current buffer if we have enough space for
                 * this request AND we have space to add a zero byte padding.
                 */
                if (!sbi->s_fc_bh) {
                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                        if (ret)
                                return NULL;
                        sbi->s_fc_bh = bh;
                }
                sbi->s_fc_bytes += len;
                return sbi->s_fc_bh->b_data + off;
        }
        /* Need to add PAD tag */
        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
        tl->fc_len = cpu_to_le16(pad_len);
        if (crc)
                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
        if (pad_len > 0)
                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
        return sbi->s_fc_bh->b_data;
}
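
/*
 * Worked example for the padding logic above (illustrative numbers, assuming
 * a 4-byte struct ext4_fc_tl and a 4096-byte block): with off = 4088 and
 * len = 16, bsize - off - 1 = 7 is not greater than len + sizeof(tl) = 20,
 * so the current block is closed out with a PAD tag covering
 * pad_len = 4096 - 4088 - 1 - 4 = 3 bytes, and the request is satisfied
 * from the next jbd2 fast commit buffer.
 */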

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
                                int len, u32 *crc)
{
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
        return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * reports its length as that of the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there isn't enough space in this block to accommodate the tail.
         */
        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
        dst += sizeof(tl);
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        dst += sizeof(tail.fc_tid);
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Adds tag, length and value, and updates the CRC. Returns true if the TLV
 * was added, false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl tl;
        u8 *dst;

        dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
        if (!dst)
                return false;

        tl.fc_tag = cpu_to_le16(tag);
        tl.fc_len = cpu_to_le16(len);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

        return true;
}
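
/*
 * For an actual call site, see ext4_fc_perform_commit() below, which emits
 * the head record roughly like this:
 *
 *     ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
 *                     (u8 *)&head, &crc);
 */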

/* Same as above, but adds a dentry TLV. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
                                        int parent_ino, int ino, int dlen,
                                        const unsigned char *dname,
                                        u32 *crc)
{
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
                                        crc);

        if (!dst)
                return false;

        fcd.fc_parent_ino = cpu_to_le32(parent_ino);
        fcd.fc_ino = cpu_to_le32(ino);
        tl.fc_tag = cpu_to_le16(tag);
        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        dst += sizeof(tl);
        ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
        dst += sizeof(fcd);
        ext4_fc_memcpy(sb, dst, dname, dlen, crc);
        dst += dlen;

        return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
        int ret;
        struct ext4_iloc iloc;
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        u8 *dst;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len += ei->i_extra_isize;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        dst = ext4_fc_reserve_space(inode->i_sb,
                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!dst)
                return -ECANCELED;

        if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
                return -ECANCELED;
        dst += sizeof(tl);
        if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
                return -ECANCELED;
        dst += sizeof(fc_inode);
        if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
                                        inode_len, crc))
                return -ECANCELED;

        return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_map_blocks map;
        struct ext4_fc_add_range fc_ext;
        struct ext4_fc_del_range lrange;
        struct ext4_extent *ex;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        old_blk_size = ei->i_fc_lblk_start;
        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk_off = old_blk_size;
        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
                  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

        while (cur_lblk_off <= new_blk_size) {
                map.m_lblk = cur_lblk_off;
                map.m_len = new_blk_size - cur_lblk_off + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        cur_lblk_off++;
                        continue;
                }

                if (ret == 0) {
                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
                        lrange.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                            sizeof(lrange), (u8 *)&lrange, crc))
                                return -ENOSPC;
                } else {
                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
                                return -ENOSPC;
                }

                cur_lblk_off += map.m_len;
        }

        return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *pos, *n;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (!ext4_test_inode_state(&pos->vfs_inode,
                                           EXT4_STATE_FC_COMMITTING))
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                ret = jbd2_wait_inode_data(journal, pos->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei, *ei_n;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(
                                sb, fc_dentry->fcd_op,
                                fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                                fc_dentry->fcd_name.len,
                                fc_dentry->fcd_name.name, crc)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }

                inode = NULL;
                list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
                                         i_fc_list) {
                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
                                inode = &ei->vfs_inode;
                                break;
                        }
                }
                /*
                 * If we don't find the inode in our list, it was deleted,
                 * in which case we don't need to record its create tag.
                 */
                if (!inode)
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(
                        sb, fc_dentry->fcd_op,
                        fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                        fc_dentry->fcd_name.len,
                        fc_dentry->fcd_name.name, crc)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        trace_ext4_fc_commit_start(sb);

        start_time = ktime_get();

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (ext4_fc_is_ineligible(sb))) {
                reason = EXT4_FC_REASON_INELIGIBLE;
                goto out;
        }

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
                goto out;
        } else if (ret) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_START_FAILED;
                goto out;
        }

        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        atomic_inc(&sbi->s_fc_subtid);
        jbd2_fc_end_commit(journal);
out:
        /* Has any ineligible update happened since we started? */
        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_INELIGIBLE;
        }

        spin_lock(&sbi->s_fc_lock);
        if (reason != EXT4_FC_REASON_OK &&
                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
                sbi->s_fc_stats.fc_ineligible_commits++;
        } else {
                sbi->s_fc_stats.fc_num_commits++;
                sbi->s_fc_stats.fc_numblks += nblks;
        }
        spin_unlock(&sbi->s_fc_lock);
        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
        trace_ext4_fc_commit_stop(sb, nblks, reason);
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /*
         * Weight the historical average higher than the current commit time
         * so we don't react too strongly to vast changes in the commit time.
         */
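        /*
         * Worked example (illustrative numbers): with a previous average of
         * 80us and a new commit taking 200us, the new average becomes
         * (200 + 3 * 80) / 4 = 110us.
         */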
        if (likely(sbi->s_fc_avg_commit_time))
                sbi->s_fc_avg_commit_time = (commit_time +
                                sbi->s_fc_avg_commit_time * 3) / 4;
        else
                sbi->s_fc_avg_commit_time = commit_time;
        jbd_debug(1,
                "Fast commit ended with blks = %d, reason = %d, subtid = %d",
                nblks, reason, subtid);
        if (reason == EXT4_FC_REASON_FC_FAILED)
                return jbd2_fc_end_commit_fallback(journal);
        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
                reason == EXT4_FC_REASON_INELIGIBLE)
                return jbd2_complete_transaction(journal, commit_tid);
        return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                spin_unlock(&sbi->s_fc_lock);

                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
        char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_dentry_info fcd;

        memcpy(&fcd, val, sizeof(fcd));

        darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
        darg->ino = le32_to_cpu(fcd.fc_ino);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
        darg->dname_len = le16_to_cpu(tl->fc_len) -
                sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        struct inode *inode, *old_parent;
        struct qstr entry;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        entry.name = darg.dname;
        entry.len = darg.dname_len;
        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode %d not found", darg.ino);
                return 0;
        }

        old_parent = ext4_iget(sb, darg.parent_ino,
                                EXT4_IGET_NORMAL);
        if (IS_ERR(old_parent)) {
                jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
                iput(inode);
                return 0;
        }

        ret = __ext4_unlink(NULL, old_parent, &entry, inode);
        /* -ENOENT is OK because the entry might not exist anymore. */
        if (ret == -ENOENT)
                ret = 0;
        iput(old_parent);
        iput(inode);
        return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                jbd_debug(1, "Failed to obtain dentry");
                dentry_dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                jbd_debug(1, "Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that the link already existed since the data blocks
         * for the dir in question got persisted before we crashed, OR we
         * replayed this tag and crashed before the entire replay could
         * complete.
         */
        if (ret && ret != -EEXIST) {
                jbd_debug(1, "Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}
1401
1402/* Link replay function */
1403static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1404                               u8 *val)
1405{
1406        struct inode *inode;
1407        struct dentry_info_args darg;
1408        int ret = 0;
1409
1410        tl_to_darg(&darg, tl, val);
1411        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1412                        darg.parent_ino, darg.dname_len);
1413
1414        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1415        if (IS_ERR(inode)) {
1416                jbd_debug(1, "Inode not found.");
1417                return 0;
1418        }
1419
1420        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1421        iput(inode);
1422        return ret;
1423}
1424
1425/*
 * Record all the inodes modified during replay. We use this list later to set
 * up the block bitmaps correctly.
1428 */
1429static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1430{
1431        struct ext4_fc_replay_state *state;
1432        int i;
1433
1434        state = &EXT4_SB(sb)->s_fc_replay_state;
1435        for (i = 0; i < state->fc_modified_inodes_used; i++)
1436                if (state->fc_modified_inodes[i] == ino)
1437                        return 0;
        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *fc_modified_inodes;

                /*
                 * Grow the array through a temporary pointer so that the old
                 * allocation is not leaked if krealloc() fails.
                 */
                fc_modified_inodes = krealloc(state->fc_modified_inodes,
                                sizeof(int) * (state->fc_modified_inodes_size +
                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_modified_inodes)
                        return -ENOMEM;
                state->fc_modified_inodes = fc_modified_inodes;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1448        state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1449        return 0;
1450}
1451
1452/*
1453 * Inode replay function
1454 */
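/*
 * The raw inode is reconstructed from the copy saved in the fast commit area
 * in two pieces: everything up to (but not including) i_block, and everything
 * from i_generation onwards. i_block is handled specially: for inline-data
 * inodes it is copied verbatim, while for extent-mapped inodes the existing
 * on-disk extent root is kept if its header looks valid (otherwise a fresh
 * empty extent header is installed) and the tree is then rebuilt by the
 * ADD_RANGE/DEL_RANGE replay tags.
 */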
1455static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1456                                u8 *val)
1457{
1458        struct ext4_fc_inode fc_inode;
1459        struct ext4_inode *raw_inode;
1460        struct ext4_inode *raw_fc_inode;
1461        struct inode *inode = NULL;
1462        struct ext4_iloc iloc;
1463        int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1464        struct ext4_extent_header *eh;
1465
1466        memcpy(&fc_inode, val, sizeof(fc_inode));
1467
1468        ino = le32_to_cpu(fc_inode.fc_ino);
1469        trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1470
1471        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1472        if (!IS_ERR(inode)) {
1473                ext4_ext_clear_bb(inode);
1474                iput(inode);
1475        }
1476        inode = NULL;
1477
1478        ext4_fc_record_modified_inode(sb, ino);
1479
1480        raw_fc_inode = (struct ext4_inode *)
1481                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1482        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1483        if (ret)
1484                goto out;
1485
1486        inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1487        raw_inode = ext4_raw_inode(&iloc);
1488
1489        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1490        memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1491                inode_len - offsetof(struct ext4_inode, i_generation));
1492        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1493                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1494                if (eh->eh_magic != EXT4_EXT_MAGIC) {
1495                        memset(eh, 0, sizeof(*eh));
1496                        eh->eh_magic = EXT4_EXT_MAGIC;
1497                        eh->eh_max = cpu_to_le16(
1498                                (sizeof(raw_inode->i_block) -
1499                                 sizeof(struct ext4_extent_header))
1500                                 / sizeof(struct ext4_extent));
1501                }
1502        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1503                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1504                        sizeof(raw_inode->i_block));
1505        }
1506
1507        /* Immediately update the inode on disk. */
1508        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1509        if (ret)
1510                goto out;
1511        ret = sync_dirty_buffer(iloc.bh);
1512        if (ret)
1513                goto out;
1514        ret = ext4_mark_inode_used(sb, ino);
1515        if (ret)
1516                goto out;
1517
1518        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1519        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1520        if (IS_ERR(inode)) {
1521                jbd_debug(1, "Inode not found.");
1522                return -EFSCORRUPTED;
1523        }
1524
        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but, until then, we recalculate
         * the number of blocks the inode occupies.
         */
1530        ext4_ext_replay_set_iblocks(inode);
1531
1532        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533        ext4_reset_inode_seed(inode);
1534
1535        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537        sync_dirty_buffer(iloc.bh);
1538        brelse(iloc.bh);
1539out:
1540        iput(inode);
1541        if (!ret)
1542                blkdev_issue_flush(sb->s_bdev);
1543
1544        return 0;
1545}
1546
1547/*
1548 * Dentry create replay function.
1549 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means that the
 * inode for which we are trying to create a dentry here should already have
 * been replayed by the time we get here.
1553 */
1554static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555                                 u8 *val)
1556{
1557        int ret = 0;
1558        struct inode *inode = NULL;
1559        struct inode *dir = NULL;
1560        struct dentry_info_args darg;
1561
1562        tl_to_darg(&darg, tl, val);
1563
1564        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565                        darg.parent_ino, darg.dname_len);
1566
        /* This takes care of updating the group descriptor and other metadata */
1568        ret = ext4_mark_inode_used(sb, darg.ino);
1569        if (ret)
1570                goto out;
1571
1572        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573        if (IS_ERR(inode)) {
1574                jbd_debug(1, "inode %d not found.", darg.ino);
1575                inode = NULL;
1576                ret = -EINVAL;
1577                goto out;
1578        }
1579
1580        if (S_ISDIR(inode->i_mode)) {
1581                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are set up properly.
1584                 */
1585                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586                if (IS_ERR(dir)) {
                        jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1588                        goto out;
1589                }
1590                ret = ext4_init_new_dir(NULL, dir, inode);
1591                iput(dir);
1592                if (ret) {
1593                        ret = 0;
1594                        goto out;
1595                }
1596        }
1597        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598        if (ret)
1599                goto out;
1600        set_nlink(inode, 1);
1601        ext4_mark_inode_dirty(NULL, inode);
1602out:
1603        if (inode)
1604                iput(inode);
1605        return ret;
1606}
1607
/*
 * Record the physical disk regions that the fast commit area reports as in
 * use. Our simple replay-phase allocator excludes these regions from
 * allocation.
 */
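/* The regions recorded here are consulted by ext4_fc_replay_check_excluded(). */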
1612static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614{
1615        struct ext4_fc_replay_state *state;
1616        struct ext4_fc_alloc_region *region;
1617
1618        state = &EXT4_SB(sb)->s_fc_replay_state;
        if (state->fc_regions_used == state->fc_regions_size) {
                struct ext4_fc_alloc_region *fc_regions;

                /*
                 * Grow the array through a temporary pointer so that the old
                 * allocation is not leaked if krealloc() fails.
                 */
                fc_regions = krealloc(state->fc_regions,
                                sizeof(struct ext4_fc_alloc_region) *
                                (state->fc_regions_size +
                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_regions)
                        return -ENOMEM;
                state->fc_regions = fc_regions;
                state->fc_regions_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1630        region = &state->fc_regions[state->fc_regions_used++];
1631        region->ino = ino;
1632        region->lblk = lblk;
1633        region->pblk = pblk;
1634        region->len = len;
1635
1636        return 0;
1637}
1638
1639/* Replay add range tag */
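/*
 * For each block in the range, the loop below looks at the current mapping
 * and handles one of three cases: (a) the range is not mapped at all, so the
 * saved extent is inserted as-is; (b) the range now maps to different
 * physical blocks, so the mapping is rewritten and the old blocks are freed
 * in the bitmaps; or (c) the range maps to the same physical blocks and only
 * the written/unwritten state may need to change.
 */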
1640static int ext4_fc_replay_add_range(struct super_block *sb,
1641                                    struct ext4_fc_tl *tl, u8 *val)
1642{
1643        struct ext4_fc_add_range fc_add_ex;
1644        struct ext4_extent newex, *ex;
1645        struct inode *inode;
1646        ext4_lblk_t start, cur;
1647        int remaining, len;
1648        ext4_fsblk_t start_pblk;
1649        struct ext4_map_blocks map;
1650        struct ext4_ext_path *path = NULL;
1651        int ret;
1652
1653        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658                ext4_ext_get_actual_len(ex));
1659
1660        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661        if (IS_ERR(inode)) {
1662                jbd_debug(1, "Inode not found.");
1663                return 0;
1664        }
1665
1666        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667
1668        start = le32_to_cpu(ex->ee_block);
1669        start_pblk = ext4_ext_pblock(ex);
1670        len = ext4_ext_get_actual_len(ex);
1671
1672        cur = start;
1673        remaining = len;
1674        jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676                  inode->i_ino);
1677
1678        while (remaining > 0) {
1679                map.m_lblk = cur;
1680                map.m_len = remaining;
1681                map.m_pblk = 0;
1682                ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684                if (ret < 0) {
1685                        iput(inode);
1686                        return 0;
1687                }
1688
1689                if (ret == 0) {
1690                        /* Range is not mapped */
1691                        path = ext4_find_extent(inode, cur, NULL, 0);
1692                        if (IS_ERR(path)) {
1693                                iput(inode);
1694                                return 0;
1695                        }
1696                        memset(&newex, 0, sizeof(newex));
1697                        newex.ee_block = cpu_to_le32(cur);
1698                        ext4_ext_store_pblock(
1699                                &newex, start_pblk + cur - start);
1700                        newex.ee_len = cpu_to_le16(map.m_len);
1701                        if (ext4_ext_is_unwritten(ex))
1702                                ext4_ext_mark_unwritten(&newex);
1703                        down_write(&EXT4_I(inode)->i_data_sem);
1704                        ret = ext4_ext_insert_extent(
1705                                NULL, inode, &path, &newex, 0);
1706                        up_write((&EXT4_I(inode)->i_data_sem));
1707                        ext4_ext_drop_refs(path);
1708                        kfree(path);
1709                        if (ret) {
1710                                iput(inode);
1711                                return 0;
1712                        }
1713                        goto next;
1714                }
1715
1716                if (start_pblk + cur - start != map.m_pblk) {
1717                        /*
1718                         * Logical to physical mapping changed. This can happen
1719                         * if this range was removed and then reallocated to
1720                         * map to new physical blocks during a fast commit.
1721                         */
1722                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723                                        ext4_ext_is_unwritten(ex),
1724                                        start_pblk + cur - start);
1725                        if (ret) {
1726                                iput(inode);
1727                                return 0;
1728                        }
1729                        /*
1730                         * Mark the old blocks as free since they aren't used
1731                         * anymore. We maintain an array of all the modified
1732                         * inodes. In case these blocks are still used at either
1733                         * a different logical range in the same inode or in
1734                         * some different inode, we will mark them as allocated
1735                         * at the end of the FC replay using our array of
1736                         * modified inodes.
1737                         */
1738                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739                        goto next;
1740                }
1741
1742                /* Range is mapped and needs a state change */
1743                jbd_debug(1, "Converting from %ld to %d %lld",
1744                                map.m_flags & EXT4_MAP_UNWRITTEN,
1745                        ext4_ext_is_unwritten(ex), map.m_pblk);
1746                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747                                        ext4_ext_is_unwritten(ex), map.m_pblk);
1748                if (ret) {
1749                        iput(inode);
1750                        return 0;
1751                }
1752                /*
1753                 * We may have split the extent tree while toggling the state.
1754                 * Try to shrink the extent tree now.
1755                 */
1756                ext4_ext_replay_shrink_inode(inode, start + len);
1757next:
1758                cur += map.m_len;
1759                remaining -= map.m_len;
1760        }
1761        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762                                        sb->s_blocksize_bits);
1763        iput(inode);
1764        return 0;
1765}
1766
1767/* Replay DEL_RANGE tag */
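/*
 * First walk the range and release any mapped blocks in the block bitmaps,
 * then punch a hole over the whole range so that the extent tree itself is
 * updated, and finally shrink the extent tree to the inode's current size.
 */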
1768static int
1769ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770                         u8 *val)
1771{
1772        struct inode *inode;
1773        struct ext4_fc_del_range lrange;
1774        struct ext4_map_blocks map;
1775        ext4_lblk_t cur, remaining;
1776        int ret;
1777
1778        memcpy(&lrange, val, sizeof(lrange));
1779        cur = le32_to_cpu(lrange.fc_lblk);
1780        remaining = le32_to_cpu(lrange.fc_len);
1781
1782        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783                le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786        if (IS_ERR(inode)) {
1787                jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788                return 0;
1789        }
1790
1791        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792
1793        jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794                        inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795                        le32_to_cpu(lrange.fc_len));
1796        while (remaining > 0) {
1797                map.m_lblk = cur;
1798                map.m_len = remaining;
1799
1800                ret = ext4_map_blocks(NULL, inode, &map, 0);
1801                if (ret < 0) {
1802                        iput(inode);
1803                        return 0;
1804                }
1805                if (ret > 0) {
1806                        remaining -= ret;
1807                        cur += ret;
1808                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809                } else {
1810                        remaining -= map.m_len;
1811                        cur += map.m_len;
1812                }
1813        }
1814
1815        ret = ext4_punch_hole(inode,
1816                le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817                le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1818        if (ret)
1819                jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820        ext4_ext_replay_shrink_inode(inode,
1821                i_size_read(inode) >> sb->s_blocksize_bits);
1822        ext4_mark_inode_dirty(NULL, inode);
1823        iput(inode);
1824
1825        return 0;
1826}
1827
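/*
 * Walk all the inodes modified during replay and mark the blocks they now
 * reference (data blocks as well as extent tree blocks) as in use in the
 * block bitmaps, making the on-disk bitmaps consistent with the replayed
 * extent trees.
 */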
1828static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829{
1830        struct ext4_fc_replay_state *state;
1831        struct inode *inode;
1832        struct ext4_ext_path *path = NULL;
1833        struct ext4_map_blocks map;
1834        int i, ret, j;
1835        ext4_lblk_t cur, end;
1836
1837        state = &EXT4_SB(sb)->s_fc_replay_state;
1838        for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839                inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840                        EXT4_IGET_NORMAL);
1841                if (IS_ERR(inode)) {
1842                        jbd_debug(1, "Inode %d not found.",
1843                                state->fc_modified_inodes[i]);
1844                        continue;
1845                }
1846                cur = 0;
1847                end = EXT_MAX_BLOCKS;
1848                while (cur < end) {
1849                        map.m_lblk = cur;
1850                        map.m_len = end - cur;
1851
1852                        ret = ext4_map_blocks(NULL, inode, &map, 0);
1853                        if (ret < 0)
1854                                break;
1855
1856                        if (ret > 0) {
1857                                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1858                                if (!IS_ERR(path)) {
1859                                        for (j = 0; j < path->p_depth; j++)
1860                                                ext4_mb_mark_bb(inode->i_sb,
1861                                                        path[j].p_block, 1, 1);
1862                                        ext4_ext_drop_refs(path);
1863                                        kfree(path);
1864                                }
1865                                cur += ret;
1866                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1867                                                        map.m_len, 1);
1868                        } else {
1869                                cur = cur + (map.m_len ? map.m_len : 1);
1870                        }
1871                }
1872                iput(inode);
1873        }
1874}
1875
/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see if it is okay to use a block.
 */
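/*
 * Note that the lookup below is a linear scan; replay is a one-off mount-time
 * operation, so simplicity is preferred over lookup speed here.
 */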
1881bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1882{
1883        int i;
1884        struct ext4_fc_replay_state *state;
1885
1886        state = &EXT4_SB(sb)->s_fc_replay_state;
1887        for (i = 0; i < state->fc_regions_valid; i++) {
1888                if (state->fc_regions[i].ino == 0 ||
1889                        state->fc_regions[i].len == 0)
1890                        continue;
1891                if (blk >= state->fc_regions[i].pblk &&
1892                    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1893                        return true;
1894        }
1895        return false;
1896}
1897
1898/* Cleanup function called after replay */
1899void ext4_fc_replay_cleanup(struct super_block *sb)
1900{
1901        struct ext4_sb_info *sbi = EXT4_SB(sb);
1902
1903        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1904        kfree(sbi->s_fc_replay_state.fc_regions);
1905        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1906}
1907
1908/*
1909 * Recovery Scan phase handler
1910 *
1911 * This function is called during the scan phase and is responsible
 * for doing the following things:
1913 * - Make sure the fast commit area has valid tags for replay
1914 * - Count number of tags that need to be replayed by the replay handler
1915 * - Verify CRC
1916 * - Create a list of excluded blocks for allocation during replay phase
1917 *
1918 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1919 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1920 * to indicate that scan has finished and JBD2 can now start replay phase.
1921 * It returns a negative error to indicate that there was an error. At the end
1922 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay phase.
1924 */
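/*
 * A fast commit block is a packed sequence of TLVs. A simplified sketch of
 * the walk performed below (illustration only):
 *
 *      for (cur = start; cur < end;
 *           cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
 *              memcpy(&tl, cur, sizeof(tl));
 *              val = cur + sizeof(tl);
 *              switch (le16_to_cpu(tl.fc_tag)) { ... }
 *      }
 */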
1925static int ext4_fc_replay_scan(journal_t *journal,
1926                                struct buffer_head *bh, int off,
1927                                tid_t expected_tid)
1928{
1929        struct super_block *sb = journal->j_private;
1930        struct ext4_sb_info *sbi = EXT4_SB(sb);
1931        struct ext4_fc_replay_state *state;
1932        int ret = JBD2_FC_REPLAY_CONTINUE;
1933        struct ext4_fc_add_range ext;
1934        struct ext4_fc_tl tl;
1935        struct ext4_fc_tail tail;
1936        __u8 *start, *end, *cur, *val;
1937        struct ext4_fc_head head;
1938        struct ext4_extent *ex;
1939
1940        state = &sbi->s_fc_replay_state;
1941
1942        start = (u8 *)bh->b_data;
1943        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1944
1945        if (state->fc_replay_expected_off == 0) {
1946                state->fc_cur_tag = 0;
1947                state->fc_replay_num_tags = 0;
1948                state->fc_crc = 0;
1949                state->fc_regions = NULL;
1950                state->fc_regions_valid = state->fc_regions_used =
1951                        state->fc_regions_size = 0;
1952                /* Check if we can stop early */
1953                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1954                        != EXT4_FC_TAG_HEAD)
1955                        return 0;
1956        }
1957
1958        if (off != state->fc_replay_expected_off) {
1959                ret = -EFSCORRUPTED;
1960                goto out_err;
1961        }
1962
1963        state->fc_replay_expected_off++;
1964        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1965                memcpy(&tl, cur, sizeof(tl));
1966                val = cur + sizeof(tl);
1967                jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1968                          tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1969                switch (le16_to_cpu(tl.fc_tag)) {
1970                case EXT4_FC_TAG_ADD_RANGE:
1971                        memcpy(&ext, val, sizeof(ext));
1972                        ex = (struct ext4_extent *)&ext.fc_ex;
1973                        ret = ext4_fc_record_regions(sb,
1974                                le32_to_cpu(ext.fc_ino),
1975                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1976                                ext4_ext_get_actual_len(ex));
1977                        if (ret < 0)
1978                                break;
1979                        ret = JBD2_FC_REPLAY_CONTINUE;
1980                        fallthrough;
1981                case EXT4_FC_TAG_DEL_RANGE:
1982                case EXT4_FC_TAG_LINK:
1983                case EXT4_FC_TAG_UNLINK:
1984                case EXT4_FC_TAG_CREAT:
1985                case EXT4_FC_TAG_INODE:
1986                case EXT4_FC_TAG_PAD:
1987                        state->fc_cur_tag++;
1988                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1989                                        sizeof(tl) + le16_to_cpu(tl.fc_len));
1990                        break;
1991                case EXT4_FC_TAG_TAIL:
1992                        state->fc_cur_tag++;
1993                        memcpy(&tail, val, sizeof(tail));
1994                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995                                                sizeof(tl) +
1996                                                offsetof(struct ext4_fc_tail,
1997                                                fc_crc));
1998                        if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1999                                le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2000                                state->fc_replay_num_tags = state->fc_cur_tag;
2001                                state->fc_regions_valid =
2002                                        state->fc_regions_used;
2003                        } else {
2004                                ret = state->fc_replay_num_tags ?
2005                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2006                        }
2007                        state->fc_crc = 0;
2008                        break;
2009                case EXT4_FC_TAG_HEAD:
2010                        memcpy(&head, val, sizeof(head));
2011                        if (le32_to_cpu(head.fc_features) &
2012                                ~EXT4_FC_SUPPORTED_FEATURES) {
2013                                ret = -EOPNOTSUPP;
2014                                break;
2015                        }
2016                        if (le32_to_cpu(head.fc_tid) != expected_tid) {
2017                                ret = JBD2_FC_REPLAY_STOP;
2018                                break;
2019                        }
2020                        state->fc_cur_tag++;
2021                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2022                                            sizeof(tl) + le16_to_cpu(tl.fc_len));
2023                        break;
2024                default:
2025                        ret = state->fc_replay_num_tags ?
2026                                JBD2_FC_REPLAY_STOP : -ECANCELED;
2027                }
2028                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2029                        break;
2030        }
2031
2032out_err:
2033        trace_ext4_fc_replay_scan(sb, ret, off);
2034        return ret;
2035}
2036
2037/*
2038 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan() above.
2040 */
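/*
 * JBD2 invokes this callback once per fast commit block, first for the scan
 * pass and then again for the replay pass; the scan pass is delegated to
 * ext4_fc_replay_scan() above.
 */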
2041static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2042                                enum passtype pass, int off, tid_t expected_tid)
2043{
2044        struct super_block *sb = journal->j_private;
2045        struct ext4_sb_info *sbi = EXT4_SB(sb);
2046        struct ext4_fc_tl tl;
2047        __u8 *start, *end, *cur, *val;
2048        int ret = JBD2_FC_REPLAY_CONTINUE;
2049        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2050        struct ext4_fc_tail tail;
2051
2052        if (pass == PASS_SCAN) {
2053                state->fc_current_pass = PASS_SCAN;
2054                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2055        }
2056
2057        if (state->fc_current_pass != pass) {
2058                state->fc_current_pass = pass;
2059                sbi->s_mount_state |= EXT4_FC_REPLAY;
2060        }
2061        if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2062                jbd_debug(1, "Replay stops\n");
2063                ext4_fc_set_bitmaps_and_counters(sb);
2064                return 0;
2065        }
2066
2067#ifdef CONFIG_EXT4_DEBUG
2068        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2069                pr_warn("Dropping fc block %d because max_replay set\n", off);
2070                return JBD2_FC_REPLAY_STOP;
2071        }
2072#endif
2073
2074        start = (u8 *)bh->b_data;
2075        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2076
2077        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2078                memcpy(&tl, cur, sizeof(tl));
2079                val = cur + sizeof(tl);
2080
2081                if (state->fc_replay_num_tags == 0) {
2082                        ret = JBD2_FC_REPLAY_STOP;
2083                        ext4_fc_set_bitmaps_and_counters(sb);
2084                        break;
2085                }
2086                jbd_debug(3, "Replay phase, tag:%s\n",
2087                                tag2str(le16_to_cpu(tl.fc_tag)));
2088                state->fc_replay_num_tags--;
2089                switch (le16_to_cpu(tl.fc_tag)) {
2090                case EXT4_FC_TAG_LINK:
2091                        ret = ext4_fc_replay_link(sb, &tl, val);
2092                        break;
2093                case EXT4_FC_TAG_UNLINK:
2094                        ret = ext4_fc_replay_unlink(sb, &tl, val);
2095                        break;
2096                case EXT4_FC_TAG_ADD_RANGE:
2097                        ret = ext4_fc_replay_add_range(sb, &tl, val);
2098                        break;
2099                case EXT4_FC_TAG_CREAT:
2100                        ret = ext4_fc_replay_create(sb, &tl, val);
2101                        break;
2102                case EXT4_FC_TAG_DEL_RANGE:
2103                        ret = ext4_fc_replay_del_range(sb, &tl, val);
2104                        break;
2105                case EXT4_FC_TAG_INODE:
2106                        ret = ext4_fc_replay_inode(sb, &tl, val);
2107                        break;
2108                case EXT4_FC_TAG_PAD:
2109                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2110                                             le16_to_cpu(tl.fc_len), 0);
2111                        break;
2112                case EXT4_FC_TAG_TAIL:
2113                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2114                                             le16_to_cpu(tl.fc_len), 0);
2115                        memcpy(&tail, val, sizeof(tail));
2116                        WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2117                        break;
2118                case EXT4_FC_TAG_HEAD:
2119                        break;
2120                default:
2121                        trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2122                                             le16_to_cpu(tl.fc_len), 0);
2123                        ret = -ECANCELED;
2124                        break;
2125                }
2126                if (ret < 0)
2127                        break;
2128                ret = JBD2_FC_REPLAY_CONTINUE;
2129        }
2130        return ret;
2131}
2132
2133void ext4_fc_init(struct super_block *sb, journal_t *journal)
2134{
        /*
         * We set the replay callback even if fast commit is disabled because
         * we could still have fast commit blocks that need to be replayed,
         * even if fast commit has now been turned off.
         */
2140        journal->j_fc_replay_callback = ext4_fc_replay;
2141        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2142                return;
2143        journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2144}
2145
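/*
 * These strings are indexed by the fast commit ineligibility reason codes and
 * must be kept in the same order as those codes; ext4_fc_info_show() below
 * prints one line per reason up to EXT4_FC_REASON_MAX.
 */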
2146static const char *fc_ineligible_reasons[] = {
2147        "Extended attributes changed",
2148        "Cross rename",
2149        "Journal flag changed",
2150        "Insufficient memory",
2151        "Swap boot",
2152        "Resize",
2153        "Dir renamed",
2154        "Falloc range op",
2155        "Data journalling",
2156        "FC Commit Failed"
2157};
2158
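/*
 * seq_file show callback that dumps the fast commit statistics: commit
 * counts, number of fast commit blocks, average commit time, and per-reason
 * ineligibility counts.
 */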
2159int ext4_fc_info_show(struct seq_file *seq, void *v)
2160{
2161        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2162        struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2163        int i;
2164
2165        if (v != SEQ_START_TOKEN)
2166                return 0;
2167
2168        seq_printf(seq,
2169                "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2170                   stats->fc_num_commits, stats->fc_ineligible_commits,
2171                   stats->fc_numblks,
2172                   div_u64(sbi->s_fc_avg_commit_time, 1000));
2173        seq_puts(seq, "Ineligible reasons:\n");
2174        for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2175                seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2176                        stats->fc_ineligible_reason_count[i]);
2177
2178        return 0;
2179}
2180
2181int __init ext4_fc_init_dentry_cache(void)
2182{
2183        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2184                                           SLAB_RECLAIM_ACCOUNT);
2185
2186        if (ext4_fc_dentry_cachep == NULL)
2187                return -ENOMEM;
2188
2189        return 0;
2190}
2191