linux/fs/ext4/fast_commit.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine-grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record deltas in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30 * - EXT4_FC_TAG_LINK           - records directory entry link
  31 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
  41 *                                during recovery. Note that the iblocks
  42 *                                field is not replayed and is instead
  43 *                                derived during replay.
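     *
     * For illustration, the fast commit area is simply a byte stream of such
     * TLVs. The sketch below is a minimal userspace model (not kernel code)
     * of walking that stream; the two-field header mirrors struct ext4_fc_tl
     * (16-bit tag, 16-bit value length), walk_tlvs() and its buffer are
     * hypothetical, and the real on-disk fields are little-endian:
     *
     *     #include <stdint.h>
     *     #include <stdio.h>
     *     #include <string.h>
     *
     *     struct tl { uint16_t tag; uint16_t len; };   // models ext4_fc_tl
     *
     *     static void walk_tlvs(const uint8_t *buf, size_t size)
     *     {
     *             struct tl tl;
     *             size_t off = 0;
     *
     *             while (off + sizeof(tl) <= size) {
     *                     memcpy(&tl, buf + off, sizeof(tl));
     *                     printf("tag %d, %d byte value\n", tl.tag, tl.len);
     *                     off += sizeof(tl) + tl.len;  // value follows header
     *             }
     *     }
     *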
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in-memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures atomicity; please read the
  58 *     "Atomicity of commits" section below for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, the fast commit waits for it
  63 * to complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update(), as sketched below.
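     *
     * A typical (hypothetical) caller therefore brackets its change like
     * this:
     *
     *     ext4_fc_start_update(inode); // blocks while inode is COMMITTING
     *     ... modify the inode and/or its data ...
     *     ext4_fc_stop_update(inode);  // wakes a waiting fast commit, if any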
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 * Not all operations are supported by fast commits today (e.g. extended
  69 * attributes). Fast commit ineligibility is marked using one of the
  70 * following two mechanisms:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
  73 *   fall back to a full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make
  76 *   all the fast commits that happen between ext4_fc_start_ineligible()
  77 *   and ext4_fc_stop_ineligible(), as well as the first fast commit after
  78 *   the call to ext4_fc_stop_ineligible(), fall back to full commits.
  79 *   Forcing one more fast commit to fall back after the stop call
  80 *   guarantees that the fast commit ineligible operation contained within
  81 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is followed
  82 *   by at least one full commit.
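     *
     * For illustration, a caller that performs an operation with no replay
     * support (the reason code below is one of the EXT4_FC_REASON_* values)
     * would bracket it like this:
     *
     *     ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
     *     ... the unsupported update, e.g. an extended attribute change ...
     *     ext4_fc_stop_ineligible(sb);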
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses the EXT4_FC_TAG_TAIL tag, which marks a fast commit as complete. The
  88 * tail tag contains a CRC of the contents and the TID of the transaction
  89 * after which this fast commit should be applied. Recovery code replays the
  90 * fast commit logs only if there's at least 1 valid tail present. For every
  91 * fast commit operation, there is 1 tail. This means we may end up with
  92 * multiple tails in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
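     *
     * As a userspace model (not kernel code) of that check, reusing the
     * struct tl model above: a scan pass can remember where the last tail
     * whose CRC verifies ends, and discard everything after it; next_tlv(),
     * crc_ok() and replay_tlvs() below are hypothetical helpers:
     *
     *     size_t off = 0, last_good = 0;
     *     struct tl tl;
     *
     *     while (next_tlv(buf, size, &off, &tl)) {
     *             if (tl.tag == EXT4_FC_TAG_TAIL && crc_ok(buf, off))
     *                     last_good = off; // a complete, valid fast commit
     *     }
     *     replay_tlvs(buf, last_good);     // replay only the validated part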
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commit tags are idempotent provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation rather
 112 * than the procedure itself.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when the recovery code runs, it needs to "enforce" this state on the
 124 * file system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * during the replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensure idempotence during the replay.
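     *
     * In code terms, each replay step is written as "check, then enforce",
     * so running it twice is harmless. A sketch with hypothetical helpers:
     *
     *     if (!dirent_exists(dir, "A"))      // already replayed? skip
     *             link_dirent(dir, "A", 11); // enforce: A points to inode 11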
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that, if we crash during fast commit replay and
 161 *    then try to do recovery again, we will find a file system where the
 162 *    fast commit area is invalid (because a new full commit would be found).
 163 *    In order to deal with that, fast commit replay code should ensure that
 164 *    the "FC_REPLAY" superblock state is persisted before starting the
 165 *    replay, so that after the crash, fast commit recovery code can look at
 166 *    that flag and perform fast commit recovery even if that area is
 167 *    invalidated by later full commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at a much higher level
 172 *    than ext4_journal_start(). This can be made more fine grained by
 173 *    combining with ext4_journal_start().
 174 *
 175 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible().
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185        BUFFER_TRACE(bh, "");
 186        if (uptodate) {
 187                ext4_debug("%s: Block %lld up-to-date",
 188                           __func__, bh->b_blocknr);
 189                set_buffer_uptodate(bh);
 190        } else {
 191                ext4_debug("%s: Block %lld not up-to-date",
 192                           __func__, bh->b_blocknr);
 193                clear_buffer_uptodate(bh);
 194        }
 195
 196        unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201        struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203        ei->i_fc_lblk_start = 0;
 204        ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209        struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211        ext4_fc_reset_inode(inode);
 212        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213        INIT_LIST_HEAD(&ei->i_fc_list);
 214        init_waitqueue_head(&ei->i_fc_wait);
 215        atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222        wait_queue_head_t *wq;
 223        struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227                        EXT4_STATE_FC_COMMITTING);
 228        wq = bit_waitqueue(&ei->i_state_flags,
 229                                EXT4_STATE_FC_COMMITTING);
 230#else
 231        DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232                        EXT4_STATE_FC_COMMITTING);
 233        wq = bit_waitqueue(&ei->i_flags,
 234                                EXT4_STATE_FC_COMMITTING);
 235#endif
 236        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239        schedule();
 240        finish_wait(wq, &wait.wq_entry);
 241}
 242
 243/*
 244 * Inform Ext4's fast commit subsystem about the start of an inode update
 245 *
 246 * This function is called by high level VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252        struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256                return;
 257
 258restart:
 259        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260        if (list_empty(&ei->i_fc_list))
 261                goto out;
 262
 263        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264                ext4_fc_wait_committing_inode(inode);
 265                goto restart;
 266        }
 267out:
 268        atomic_inc(&ei->i_fc_updates);
 269        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277        struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281                return;
 282
 283        if (atomic_dec_and_test(&ei->i_fc_updates))
 284                wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove the inode from the fast commit list. If the inode is being
 289 * committed, we wait until the inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293        struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297                return;
 298
 299restart:
 300        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301        if (list_empty(&ei->i_fc_list)) {
 302                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303                return;
 304        }
 305
 306        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307                ext4_fc_wait_committing_inode(inode);
 308                goto restart;
 309        }
 310        list_del_init(&ei->i_fc_list);
 311        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark the file system as fast commit ineligible. This means that the next
 316 * commit operation will result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320        struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324                return;
 325
 326        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327        WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337        struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341                return;
 342
 343        WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345        atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357                return;
 358
 359        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360        atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365        return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366                atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380        handle_t *handle, struct inode *inode,
 381        int (*__fc_track_fn)(struct inode *, void *, bool),
 382        void *args, int enqueue)
 383{
 384        bool update = false;
 385        struct ext4_inode_info *ei = EXT4_I(inode);
 386        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387        tid_t tid = 0;
 388        int ret;
 389
 390        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391            (sbi->s_mount_state & EXT4_FC_REPLAY))
 392                return -EOPNOTSUPP;
 393
 394        if (ext4_fc_is_ineligible(inode->i_sb))
 395                return -EINVAL;
 396
 397        tid = handle->h_transaction->t_tid;
 398        mutex_lock(&ei->i_fc_lock);
 399        if (tid == ei->i_sync_tid) {
 400                update = true;
 401        } else {
 402                ext4_fc_reset_inode(inode);
 403                ei->i_sync_tid = tid;
 404        }
 405        ret = __fc_track_fn(inode, args, update);
 406        mutex_unlock(&ei->i_fc_lock);
 407
 408        if (!enqueue)
 409                return ret;
 410
 411        spin_lock(&sbi->s_fc_lock);
 412        if (list_empty(&EXT4_I(inode)->i_fc_list))
 413                list_add_tail(&EXT4_I(inode)->i_fc_list,
 414                                (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415                                &sbi->s_fc_q[FC_Q_STAGING] :
 416                                &sbi->s_fc_q[FC_Q_MAIN]);
 417        spin_unlock(&sbi->s_fc_lock);
 418
 419        return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423        struct dentry *dentry;
 424        int op;
 425};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430        struct ext4_fc_dentry_update *node;
 431        struct ext4_inode_info *ei = EXT4_I(inode);
 432        struct __track_dentry_update_args *dentry_update =
 433                (struct __track_dentry_update_args *)arg;
 434        struct dentry *dentry = dentry_update->dentry;
 435        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437        mutex_unlock(&ei->i_fc_lock);
 438        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439        if (!node) {
 440                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441                mutex_lock(&ei->i_fc_lock);
 442                return -ENOMEM;
 443        }
 444
 445        node->fcd_op = dentry_update->op;
 446        node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447        node->fcd_ino = inode->i_ino;
 448        if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450                if (!node->fcd_name.name) {
 451                        kmem_cache_free(ext4_fc_dentry_cachep, node);
 452                        ext4_fc_mark_ineligible(inode->i_sb,
 453                                EXT4_FC_REASON_NOMEM);
 454                        mutex_lock(&ei->i_fc_lock);
 455                        return -ENOMEM;
 456                }
 457                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458                        dentry->d_name.len);
 459        } else {
 460                memcpy(node->fcd_iname, dentry->d_name.name,
 461                        dentry->d_name.len);
 462                node->fcd_name.name = node->fcd_iname;
 463        }
 464        node->fcd_name.len = dentry->d_name.len;
 465
 466        spin_lock(&sbi->s_fc_lock);
 467        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468                list_add_tail(&node->fcd_list,
 469                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470        else
 471                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472        spin_unlock(&sbi->s_fc_lock);
 473        mutex_lock(&ei->i_fc_lock);
 474
 475        return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479                struct inode *inode, struct dentry *dentry)
 480{
 481        struct __track_dentry_update_args args;
 482        int ret;
 483
 484        args.dentry = dentry;
 485        args.op = EXT4_FC_TAG_UNLINK;
 486
 487        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488                                        (void *)&args, 0);
 489        trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494        __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498        struct inode *inode, struct dentry *dentry)
 499{
 500        struct __track_dentry_update_args args;
 501        int ret;
 502
 503        args.dentry = dentry;
 504        args.op = EXT4_FC_TAG_LINK;
 505
 506        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507                                        (void *)&args, 0);
 508        trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517                          struct dentry *dentry)
 518{
 519        struct __track_dentry_update_args args;
 520        int ret;
 521
 522        args.dentry = dentry;
 523        args.op = EXT4_FC_TAG_CREAT;
 524
 525        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526                                        (void *)&args, 0);
 527        trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532        __ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538        if (update)
 539                return -EEXIST;
 540
 541        EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543        return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548        int ret;
 549
 550        if (S_ISDIR(inode->i_mode))
 551                return;
 552
 553        if (ext4_should_journal_data(inode)) {
 554                ext4_fc_mark_ineligible(inode->i_sb,
 555                                        EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556                return;
 557        }
 558
 559        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560        trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564        ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570        struct ext4_inode_info *ei = EXT4_I(inode);
 571        ext4_lblk_t oldstart;
 572        struct __track_range_args *__arg =
 573                (struct __track_range_args *)arg;
 574
 575        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577                return -ECANCELED;
 578        }
 579
 580        oldstart = ei->i_fc_lblk_start;
 581
 582        if (update && ei->i_fc_lblk_len > 0) {
 583                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584                ei->i_fc_lblk_len =
 585                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586                                ei->i_fc_lblk_start + 1;
 587        } else {
 588                ei->i_fc_lblk_start = __arg->start;
 589                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590        }
 591
 592        return 0;
 593}
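
    /*
     * Worked example of the merge above: if the currently tracked range is
     * [10, 19] (i_fc_lblk_start = 10, i_fc_lblk_len = 10) and an update for
     * [5, 12] arrives, the result is start = min(10, 5) = 5 and
     * len = max(10 + 10 - 1, 12) - 5 + 1 = 15, i.e. the union [5, 19].
     */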
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596                         ext4_lblk_t end)
 597{
 598        struct __track_range_args args;
 599        int ret;
 600
 601        if (S_ISDIR(inode->i_mode))
 602                return;
 603
 604        args.start = start;
 605        args.end = end;
 606
 607        ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609        trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614        int write_flags = REQ_SYNC;
 615        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617        /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
 618        if (test_opt(sb, BARRIER) && is_tail)
 619                write_flags |= REQ_FUA | REQ_PREFLUSH;
 620        lock_buffer(bh);
 621        set_buffer_dirty(bh);
 622        set_buffer_uptodate(bh);
 623        bh->b_end_io = ext4_end_buffer_io_sync;
 624        submit_bh(REQ_OP_WRITE, write_flags, bh);
 625        EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632                                u32 *crc)
 633{
 634        void *ret;
 635
 636        ret = memset(dst, 0, len);
 637        if (crc)
 638                *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639        return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
 645 * During the commit time this function is used to manage fast commit
 646 * block space. We don't split a fast commit log onto different
 647 * blocks. So this function makes sure that if there's not enough space
 648 * on the current block, the remaining space in the current block is
 649 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 650 * new block is requested from jbd2 and the CRC is updated to reflect
 651 * the padding we added.
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655        struct ext4_fc_tl *tl;
 656        struct ext4_sb_info *sbi = EXT4_SB(sb);
 657        struct buffer_head *bh;
 658        int bsize = sbi->s_journal->j_blocksize;
 659        int ret, off = sbi->s_fc_bytes % bsize;
 660        int pad_len;
 661
 662        /*
 663         * After allocating len, we should have space at least for a 0 byte
 664         * padding.
 665         */
 666        if (len + sizeof(struct ext4_fc_tl) > bsize)
 667                return NULL;
 668
 669        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670                /*
 671                 * Only allocate from current buffer if we have enough space for
 672                 * this request AND we have space to add a zero byte padding.
 673                 */
 674                if (!sbi->s_fc_bh) {
 675                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676                        if (ret)
 677                                return NULL;
 678                        sbi->s_fc_bh = bh;
 679                }
 680                sbi->s_fc_bytes += len;
 681                return sbi->s_fc_bh->b_data + off;
 682        }
 683        /* Need to add PAD tag */
 684        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687        tl->fc_len = cpu_to_le16(pad_len);
 688        if (crc)
 689                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690        if (pad_len > 0)
 691                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692        ext4_fc_submit_bh(sb, false);
 693
 694        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695        if (ret)
 696                return NULL;
 697        sbi->s_fc_bh = bh;
 698        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699        return sbi->s_fc_bh->b_data;
 700}
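
    /*
     * Worked example of the padding logic above, assuming a 4096-byte
     * journal block and a 4-byte struct ext4_fc_tl: a request for len = 100
     * at off = 4000 fails the "bsize - off - 1 > len + sizeof(tl)" check
     * (95 > 104 is false), so a PAD tag with
     * fc_len = 4096 - 4000 - 1 - 4 = 91 is written at offset 4000 and the
     * 100 bytes are served from the start of a fresh jbd2 block.
     */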
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704                                int len, u32 *crc)
 705{
 706        if (crc)
 707                *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708        return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing the tail tag marks the end of a fast commit. In order to
 715 * guarantee atomicity, after writing the tail tag, even if there's space
 716 * remaining in the block, the next commit shouldn't use it. That's why the
 717 * tail tag's length covers all of the remaining space in the block.
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721        struct ext4_sb_info *sbi = EXT4_SB(sb);
 722        struct ext4_fc_tl tl;
 723        struct ext4_fc_tail tail;
 724        int off, bsize = sbi->s_journal->j_blocksize;
 725        u8 *dst;
 726
 727        /*
 728         * ext4_fc_reserve_space takes care of allocating an extra block if
 729         * there's not enough space on this block to accommodate this tail.
 730         */
 731        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732        if (!dst)
 733                return -ENOSPC;
 734
 735        off = sbi->s_fc_bytes % bsize;
 736
 737        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742        dst += sizeof(tl);
 743        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745        dst += sizeof(tail.fc_tid);
 746        tail.fc_crc = cpu_to_le32(crc);
 747        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 748
 749        ext4_fc_submit_bh(sb, true);
 750
 751        return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759                           u32 *crc)
 760{
 761        struct ext4_fc_tl tl;
 762        u8 *dst;
 763
 764        dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765        if (!dst)
 766                return false;
 767
 768        tl.fc_tag = cpu_to_le16(tag);
 769        tl.fc_len = cpu_to_le16(len);
 770
 771        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772        ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774        return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 779                                   struct ext4_fc_dentry_update *fc_dentry)
 780{
 781        struct ext4_fc_dentry_info fcd;
 782        struct ext4_fc_tl tl;
 783        int dlen = fc_dentry->fcd_name.len;
 784        u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 785                                        crc);
 786
 787        if (!dst)
 788                return false;
 789
 790        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 791        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 792        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 793        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 794        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 795        dst += sizeof(tl);
 796        ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 797        dst += sizeof(fcd);
 798        ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
 799        dst += dlen;
 800
 801        return true;
 802}
 803
 804/*
 805 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 806 * Returns 0 on success, error on failure.
 807 */
 808static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 809{
 810        struct ext4_inode_info *ei = EXT4_I(inode);
 811        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 812        int ret;
 813        struct ext4_iloc iloc;
 814        struct ext4_fc_inode fc_inode;
 815        struct ext4_fc_tl tl;
 816        u8 *dst;
 817
 818        ret = ext4_get_inode_loc(inode, &iloc);
 819        if (ret)
 820                return ret;
 821
 822        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 823                inode_len = EXT4_INODE_SIZE(inode->i_sb);
 824        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 825                inode_len += ei->i_extra_isize;
 826
 827        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 828        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 829        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 830
 831        dst = ext4_fc_reserve_space(inode->i_sb,
 832                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 833        if (!dst)
 834                return -ECANCELED;
 835
 836        if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 837                return -ECANCELED;
 838        dst += sizeof(tl);
 839        if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 840                return -ECANCELED;
 841        dst += sizeof(fc_inode);
 842        if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 843                                        inode_len, crc))
 844                return -ECANCELED;
 845
 846        return 0;
 847}
 848
 849/*
 850 * Writes updated data ranges for the inode in question. Updates CRC.
 851 * Returns 0 on success, error otherwise.
 852 */
 853static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 854{
 855        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 856        struct ext4_inode_info *ei = EXT4_I(inode);
 857        struct ext4_map_blocks map;
 858        struct ext4_fc_add_range fc_ext;
 859        struct ext4_fc_del_range lrange;
 860        struct ext4_extent *ex;
 861        int ret;
 862
 863        mutex_lock(&ei->i_fc_lock);
 864        if (ei->i_fc_lblk_len == 0) {
 865                mutex_unlock(&ei->i_fc_lock);
 866                return 0;
 867        }
 868        old_blk_size = ei->i_fc_lblk_start;
 869        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 870        ei->i_fc_lblk_len = 0;
 871        mutex_unlock(&ei->i_fc_lock);
 872
 873        cur_lblk_off = old_blk_size;
 874        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 875                  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 876
 877        while (cur_lblk_off <= new_blk_size) {
 878                map.m_lblk = cur_lblk_off;
 879                map.m_len = new_blk_size - cur_lblk_off + 1;
 880                ret = ext4_map_blocks(NULL, inode, &map, 0);
 881                if (ret < 0)
 882                        return -ECANCELED;
 883
 884                if (map.m_len == 0) {
 885                        cur_lblk_off++;
 886                        continue;
 887                }
 888
 889                if (ret == 0) {
 890                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
 891                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 892                        lrange.fc_len = cpu_to_le32(map.m_len);
 893                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 894                                            sizeof(lrange), (u8 *)&lrange, crc))
 895                                return -ENOSPC;
 896                } else {
 897                        unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 898                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 899
 900                        /* Limit the number of blocks in one extent */
 901                        map.m_len = min(max, map.m_len);
 902
 903                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 904                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
 905                        ex->ee_block = cpu_to_le32(map.m_lblk);
 906                        ex->ee_len = cpu_to_le16(map.m_len);
 907                        ext4_ext_store_pblock(ex, map.m_pblk);
 908                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
 909                                ext4_ext_mark_unwritten(ex);
 910                        else
 911                                ext4_ext_mark_initialized(ex);
 912                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 913                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
 914                                return -ENOSPC;
 915                }
 916
 917                cur_lblk_off += map.m_len;
 918        }
 919
 920        return 0;
 921}
 922
 923
 924/* Submit data for all the fast commit inodes */
 925static int ext4_fc_submit_inode_data_all(journal_t *journal)
 926{
 927        struct super_block *sb = (struct super_block *)(journal->j_private);
 928        struct ext4_sb_info *sbi = EXT4_SB(sb);
 929        struct ext4_inode_info *ei;
 930        int ret = 0;
 931
 932        spin_lock(&sbi->s_fc_lock);
 933        ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 934        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 935                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 936                while (atomic_read(&ei->i_fc_updates)) {
 937                        DEFINE_WAIT(wait);
 938
 939                        prepare_to_wait(&ei->i_fc_wait, &wait,
 940                                                TASK_UNINTERRUPTIBLE);
 941                        if (atomic_read(&ei->i_fc_updates)) {
 942                                spin_unlock(&sbi->s_fc_lock);
 943                                schedule();
 944                                spin_lock(&sbi->s_fc_lock);
 945                        }
 946                        finish_wait(&ei->i_fc_wait, &wait);
 947                }
 948                spin_unlock(&sbi->s_fc_lock);
 949                ret = jbd2_submit_inode_data(ei->jinode);
 950                if (ret)
 951                        return ret;
 952                spin_lock(&sbi->s_fc_lock);
 953        }
 954        spin_unlock(&sbi->s_fc_lock);
 955
 956        return ret;
 957}
 958
 959/* Wait for completion of data for all the fast commit inodes */
 960static int ext4_fc_wait_inode_data_all(journal_t *journal)
 961{
 962        struct super_block *sb = (struct super_block *)(journal->j_private);
 963        struct ext4_sb_info *sbi = EXT4_SB(sb);
 964        struct ext4_inode_info *pos, *n;
 965        int ret = 0;
 966
 967        spin_lock(&sbi->s_fc_lock);
 968        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 969                if (!ext4_test_inode_state(&pos->vfs_inode,
 970                                           EXT4_STATE_FC_COMMITTING))
 971                        continue;
 972                spin_unlock(&sbi->s_fc_lock);
 973
 974                ret = jbd2_wait_inode_data(journal, pos->jinode);
 975                if (ret)
 976                        return ret;
 977                spin_lock(&sbi->s_fc_lock);
 978        }
 979        spin_unlock(&sbi->s_fc_lock);
 980
 981        return 0;
 982}
 983
 984/* Commit all the directory entry updates */
 985static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 986__acquires(&sbi->s_fc_lock)
 987__releases(&sbi->s_fc_lock)
 988{
 989        struct super_block *sb = (struct super_block *)(journal->j_private);
 990        struct ext4_sb_info *sbi = EXT4_SB(sb);
 991        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 992        struct inode *inode;
 993        struct ext4_inode_info *ei, *ei_n;
 994        int ret;
 995
 996        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 997                return 0;
 998        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 999                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1000                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1001                        spin_unlock(&sbi->s_fc_lock);
1002                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1003                                ret = -ENOSPC;
1004                                goto lock_and_exit;
1005                        }
1006                        spin_lock(&sbi->s_fc_lock);
1007                        continue;
1008                }
1009
1010                inode = NULL;
1011                list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1012                                         i_fc_list) {
1013                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1014                                inode = &ei->vfs_inode;
1015                                break;
1016                        }
1017                }
1018                /*
1019                 * If we don't find the inode in our list, then it was deleted,
1020                 * in which case we don't need to record its create tag.
1021                 */
1022                if (!inode)
1023                        continue;
1024                spin_unlock(&sbi->s_fc_lock);
1025
1026                /*
1027                 * We first write the inode and then the create dirent. This
1028                 * allows the recovery code to create an unnamed inode first
1029                 * and then link it to a directory entry. This allows us
1030                 * to use namei.c routines almost as is and simplifies
1031                 * the recovery code.
1032                 */
1033                ret = ext4_fc_write_inode(inode, crc);
1034                if (ret)
1035                        goto lock_and_exit;
1036
1037                ret = ext4_fc_write_inode_data(inode, crc);
1038                if (ret)
1039                        goto lock_and_exit;
1040
1041                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1042                        ret = -ENOSPC;
1043                        goto lock_and_exit;
1044                }
1045
1046                spin_lock(&sbi->s_fc_lock);
1047        }
1048        return 0;
1049lock_and_exit:
1050        spin_lock(&sbi->s_fc_lock);
1051        return ret;
1052}
1053
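    /*
     * Perform a fast commit. This implements steps [1] through [6] of the
     * "Commit Operation" section at the top of this file; step [7], waiting
     * for the fast commit buffers to reach the disk, is done by the caller,
     * ext4_fc_commit(), via jbd2_fc_wait_bufs().
     */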
1054static int ext4_fc_perform_commit(journal_t *journal)
1055{
1056        struct super_block *sb = (struct super_block *)(journal->j_private);
1057        struct ext4_sb_info *sbi = EXT4_SB(sb);
1058        struct ext4_inode_info *iter;
1059        struct ext4_fc_head head;
1060        struct inode *inode;
1061        struct blk_plug plug;
1062        int ret = 0;
1063        u32 crc = 0;
1064
1065        ret = ext4_fc_submit_inode_data_all(journal);
1066        if (ret)
1067                return ret;
1068
1069        ret = ext4_fc_wait_inode_data_all(journal);
1070        if (ret)
1071                return ret;
1072
1073        /*
1074         * If file system device is different from journal device, issue a cache
1075         * flush before we start writing fast commit blocks.
1076         */
1077        if (journal->j_fs_dev != journal->j_dev)
1078                blkdev_issue_flush(journal->j_fs_dev);
1079
1080        blk_start_plug(&plug);
1081        if (sbi->s_fc_bytes == 0) {
1082                /*
1083                 * Add a head tag only if this is the first fast commit
1084                 * in this TID.
1085                 */
1086                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1087                head.fc_tid = cpu_to_le32(
1088                        sbi->s_journal->j_running_transaction->t_tid);
1089                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1090                        (u8 *)&head, &crc)) {
1091                        ret = -ENOSPC;
1092                        goto out;
1093                }
1094        }
1095
1096        spin_lock(&sbi->s_fc_lock);
1097        ret = ext4_fc_commit_dentry_updates(journal, &crc);
1098        if (ret) {
1099                spin_unlock(&sbi->s_fc_lock);
1100                goto out;
1101        }
1102
1103        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1104                inode = &iter->vfs_inode;
1105                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106                        continue;
1107
1108                spin_unlock(&sbi->s_fc_lock);
1109                ret = ext4_fc_write_inode_data(inode, &crc);
1110                if (ret)
1111                        goto out;
1112                ret = ext4_fc_write_inode(inode, &crc);
1113                if (ret)
1114                        goto out;
1115                spin_lock(&sbi->s_fc_lock);
1116        }
1117        spin_unlock(&sbi->s_fc_lock);
1118
1119        ret = ext4_fc_write_tail(sb, crc);
1120
1121out:
1122        blk_finish_plug(&plug);
1123        return ret;
1124}
1125
1126/*
1127 * The main commit entry point. Performs a fast commit for transaction
1128 * commit_tid if needed. If it's not possible to perform a fast commit
1129 * due to various reasons, we fall back to full commit. Returns 0
1130 * on success, error otherwise.
1131 */
1132int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133{
1134        struct super_block *sb = (struct super_block *)(journal->j_private);
1135        struct ext4_sb_info *sbi = EXT4_SB(sb);
1136        int nblks = 0, ret, bsize = journal->j_blocksize;
1137        int subtid = atomic_read(&sbi->s_fc_subtid);
1138        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139        ktime_t start_time, commit_time;
1140
1141        trace_ext4_fc_commit_start(sb);
1142
1143        start_time = ktime_get();
1144
1145        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146                (ext4_fc_is_ineligible(sb))) {
1147                reason = EXT4_FC_REASON_INELIGIBLE;
1148                goto out;
1149        }
1150
1151restart_fc:
1152        ret = jbd2_fc_begin_commit(journal, commit_tid);
1153        if (ret == -EALREADY) {
1154                /* There was an ongoing commit, check if we need to restart */
1155                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156                        commit_tid > journal->j_commit_sequence)
1157                        goto restart_fc;
1158                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159                goto out;
1160        } else if (ret) {
1161                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162                reason = EXT4_FC_REASON_FC_START_FAILED;
1163                goto out;
1164        }
1165
1166        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167        ret = ext4_fc_perform_commit(journal);
1168        if (ret < 0) {
1169                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170                reason = EXT4_FC_REASON_FC_FAILED;
1171                goto out;
1172        }
1173        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174        ret = jbd2_fc_wait_bufs(journal, nblks);
1175        if (ret < 0) {
1176                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177                reason = EXT4_FC_REASON_FC_FAILED;
1178                goto out;
1179        }
1180        atomic_inc(&sbi->s_fc_subtid);
1181        jbd2_fc_end_commit(journal);
1182out:
1183        /* Has any ineligible update happened since we started? */
1184        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186                reason = EXT4_FC_REASON_INELIGIBLE;
1187        }
1188
1189        spin_lock(&sbi->s_fc_lock);
1190        if (reason != EXT4_FC_REASON_OK &&
1191                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192                sbi->s_fc_stats.fc_ineligible_commits++;
1193        } else {
1194                sbi->s_fc_stats.fc_num_commits++;
1195                sbi->s_fc_stats.fc_numblks += nblks;
1196        }
1197        spin_unlock(&sbi->s_fc_lock);
1198        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199        trace_ext4_fc_commit_stop(sb, nblks, reason);
1200        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201        /*
1202         * weight the average time higher than the new commit time so we
1203         * don't react too strongly to vast changes in the commit time
1204         */
1205        if (likely(sbi->s_fc_avg_commit_time))
1206                sbi->s_fc_avg_commit_time = (commit_time +
1207                                sbi->s_fc_avg_commit_time * 3) / 4;
1208        else
1209                sbi->s_fc_avg_commit_time = commit_time;
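            /*
             * Worked example of the 3:1 weighting above: with a previous
             * average of 8000 ns and a new commit_time of 4000 ns, the new
             * average becomes (4000 + 8000 * 3) / 4 = 7000 ns.
             */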
1210        jbd_debug(1,
1211                "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212                nblks, reason, subtid);
1213        if (reason == EXT4_FC_REASON_FC_FAILED)
1214                return jbd2_fc_end_commit_fallback(journal);
1215        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216                reason == EXT4_FC_REASON_INELIGIBLE)
1217                return jbd2_complete_transaction(journal, commit_tid);
1218        return 0;
1219}
1220
1221/*
1222 * Fast commit cleanup routine. This is called after every fast commit and
1223 * full commit. full is true if we are called after a full commit.
1224 */
1225static void ext4_fc_cleanup(journal_t *journal, int full)
1226{
1227        struct super_block *sb = journal->j_private;
1228        struct ext4_sb_info *sbi = EXT4_SB(sb);
1229        struct ext4_inode_info *iter, *iter_n;
1230        struct ext4_fc_dentry_update *fc_dentry;
1231
1232        if (full && sbi->s_fc_bh)
1233                sbi->s_fc_bh = NULL;
1234
1235        jbd2_fc_release_bufs(journal);
1236
1237        spin_lock(&sbi->s_fc_lock);
1238        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1239                                 i_fc_list) {
1240                list_del_init(&iter->i_fc_list);
1241                ext4_clear_inode_state(&iter->vfs_inode,
1242                                       EXT4_STATE_FC_COMMITTING);
1243                ext4_fc_reset_inode(&iter->vfs_inode);
1244                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1245                smp_mb();
1246#if (BITS_PER_LONG < 64)
1247                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1248#else
1249                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1250#endif
1251        }
1252
1253        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1254                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1255                                             struct ext4_fc_dentry_update,
1256                                             fcd_list);
1257                list_del_init(&fc_dentry->fcd_list);
1258                spin_unlock(&sbi->s_fc_lock);
1259
1260                if (fc_dentry->fcd_name.name &&
1261                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1262                        kfree(fc_dentry->fcd_name.name);
1263                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1264                spin_lock(&sbi->s_fc_lock);
1265        }
1266
1267        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1268                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1269        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1270                                &sbi->s_fc_q[FC_Q_MAIN]);
1271
1272        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1273        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1274
1275        if (full)
1276                sbi->s_fc_bytes = 0;
1277        spin_unlock(&sbi->s_fc_lock);
1278        trace_ext4_fc_stats(sb);
1279}
1280
1281/* Ext4 Replay Path Routines */
1282
1283/* Helper struct for dentry replay routines */
1284struct dentry_info_args {
1285        int parent_ino, dname_len, ino, inode_len;
1286        char *dname;
1287};
1288
1289static inline void tl_to_darg(struct dentry_info_args *darg,
1290                              struct  ext4_fc_tl *tl, u8 *val)
1291{
1292        struct ext4_fc_dentry_info fcd;
1293
1294        memcpy(&fcd, val, sizeof(fcd));
1295
1296        darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1297        darg->ino = le32_to_cpu(fcd.fc_ino);
1298        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1299        darg->dname_len = le16_to_cpu(tl->fc_len) -
1300                sizeof(struct ext4_fc_dentry_info);
1301}
1302
1303/* Unlink replay function */
1304static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1305                                 u8 *val)
1306{
1307        struct inode *inode, *old_parent;
1308        struct qstr entry;
1309        struct dentry_info_args darg;
1310        int ret = 0;
1311
1312        tl_to_darg(&darg, tl, val);
1313
1314        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315                        darg.parent_ino, darg.dname_len);
1316
1317        entry.name = darg.dname;
1318        entry.len = darg.dname_len;
1319        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1320
1321        if (IS_ERR(inode)) {
1322                jbd_debug(1, "Inode %d not found", darg.ino);
1323                return 0;
1324        }
1325
1326        old_parent = ext4_iget(sb, darg.parent_ino,
1327                                EXT4_IGET_NORMAL);
1328        if (IS_ERR(old_parent)) {
1329                jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1330                iput(inode);
1331                return 0;
1332        }
1333
1334        ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1335        /* -ENOENT is OK because the entry might not exist anymore. */
1336        if (ret == -ENOENT)
1337                ret = 0;
1338        iput(old_parent);
1339        iput(inode);
1340        return ret;
1341}
1342
1343static int ext4_fc_replay_link_internal(struct super_block *sb,
1344                                struct dentry_info_args *darg,
1345                                struct inode *inode)
1346{
1347        struct inode *dir = NULL;
1348        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1349        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1350        int ret = 0;
1351
1352        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1353        if (IS_ERR(dir)) {
1354                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1355                dir = NULL;
1356                goto out;
1357        }
1358
1359        dentry_dir = d_obtain_alias(dir);
1360        if (IS_ERR(dentry_dir)) {
1361                jbd_debug(1, "Failed to obtain dentry");
1362                dentry_dir = NULL;
1363                goto out;
1364        }
1365
1366        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1367        if (!dentry_inode) {
1368                jbd_debug(1, "Inode dentry not created.");
1369                ret = -ENOMEM;
1370                goto out;
1371        }
1372
1373        ret = __ext4_link(dir, inode, dentry_inode);
1374        /*
1375         * It's possible that the link already existed since the data blocks
1376         * for the dir in question got persisted before we crashed OR
1377         * we replayed this tag and crashed before the entire replay
1378         * could complete.
1379         */
1380        if (ret && ret != -EEXIST) {
1381                jbd_debug(1, "Failed to link\n");
1382                goto out;
1383        }
1384
1385        ret = 0;
1386out:
1387        if (dentry_dir) {
1388                d_drop(dentry_dir);
1389                dput(dentry_dir);
1390        } else if (dir) {
1391                iput(dir);
1392        }
1393        if (dentry_inode) {
1394                d_drop(dentry_inode);
1395                dput(dentry_inode);
1396        }
1397
1398        return ret;
1399}
1400
1401/* Link replay function */
1402static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1403                               u8 *val)
1404{
1405        struct inode *inode;
1406        struct dentry_info_args darg;
1407        int ret = 0;
1408
1409        tl_to_darg(&darg, tl, val);
1410        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1411                        darg.parent_ino, darg.dname_len);
1412
1413        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1414        if (IS_ERR(inode)) {
1415                jbd_debug(1, "Inode not found.");
1416                return 0;
1417        }
1418
1419        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1420        iput(inode);
1421        return ret;
1422}
1423
1424/*
1425 * Record all the modified inodes during replay. We use this later to set up
1426 * block bitmaps correctly.
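     *
     * The array grows on demand in chunks of EXT4_FC_REPLAY_REALLOC_INCREMENT
     * entries; assuming an increment of 4, its capacity goes 0 -> 4 -> 8 and
     * so on as new inode numbers get recorded.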
1427 */
1428static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1429{
1430        struct ext4_fc_replay_state *state;
1431        int i;
1432
1433        state = &EXT4_SB(sb)->s_fc_replay_state;
1434        for (i = 0; i < state->fc_modified_inodes_used; i++)
1435                if (state->fc_modified_inodes[i] == ino)
1436                        return 0;
1437        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1438                state->fc_modified_inodes_size +=
1439                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
1440                state->fc_modified_inodes = krealloc(
1441                                        state->fc_modified_inodes, sizeof(int) *
1442                                        state->fc_modified_inodes_size,
1443                                        GFP_KERNEL);
1444                if (!state->fc_modified_inodes)
1445                        return -ENOMEM;
1446        }
1447        state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1448        return 0;
1449}
1450
1451/*
1452 * Inode replay function
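     *
     * A sketch of what happens below: everything except i_block is copied
     * from the inode image logged in the fast commit area; i_block is copied
     * only for inline-data inodes, while extent inodes keep (or get a
     * freshly initialized) extent header and have their extents rebuilt by
     * ADD_RANGE replay.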
1453 */
1454static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1455                                u8 *val)
1456{
1457        struct ext4_fc_inode fc_inode;
1458        struct ext4_inode *raw_inode;
1459        struct ext4_inode *raw_fc_inode;
1460        struct inode *inode = NULL;
1461        struct ext4_iloc iloc;
1462        int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1463        struct ext4_extent_header *eh;
1464
1465        memcpy(&fc_inode, val, sizeof(fc_inode));
1466
1467        ino = le32_to_cpu(fc_inode.fc_ino);
1468        trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1469
1470        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1471        if (!IS_ERR(inode)) {
1472                ext4_ext_clear_bb(inode);
1473                iput(inode);
1474        }
1475        inode = NULL;
1476
1477        ext4_fc_record_modified_inode(sb, ino);
1478
1479        raw_fc_inode = (struct ext4_inode *)
1480                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1481        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1482        if (ret)
1483                goto out;
1484
1485        inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1486        raw_inode = ext4_raw_inode(&iloc);
1487
1488        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1489        memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1490                inode_len - offsetof(struct ext4_inode, i_generation));
1491        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1492                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1493                if (eh->eh_magic != EXT4_EXT_MAGIC) {
1494                        memset(eh, 0, sizeof(*eh));
1495                        eh->eh_magic = EXT4_EXT_MAGIC;
1496                        eh->eh_max = cpu_to_le16(
1497                                (sizeof(raw_inode->i_block) -
1498                                 sizeof(struct ext4_extent_header))
1499                                 / sizeof(struct ext4_extent));
1500                }
1501        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1502                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1503                        sizeof(raw_inode->i_block));
1504        }
1505
1506        /* Immediately update the inode on disk. */
1507        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1508        if (ret)
1509                goto out;
1510        ret = sync_dirty_buffer(iloc.bh);
1511        if (ret)
1512                goto out;
1513        ret = ext4_mark_inode_used(sb, ino);
1514        if (ret)
1515                goto out;
1516
1517        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1518        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1519        if (IS_ERR(inode)) {
1520                jbd_debug(1, "Inode not found.");
1521                return -EFSCORRUPTED;
1522        }
1523
1524        /*
1525         * Our allocator could have made different decisions than before
1526         * crashing. This should be fixed but until then, we recalculate
1527         * the number of blocks the inode occupies.
1528         */
1529        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1530                ext4_ext_replay_set_iblocks(inode);
1531
1532        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533        ext4_reset_inode_seed(inode);
1534
1535        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537        sync_dirty_buffer(iloc.bh);
1538        brelse(iloc.bh);
1539out:
1540        iput(inode);
1541        if (!ret)
1542                blkdev_issue_flush(sb->s_bdev);
1543
1544        return 0;
1545}
1546
1547/*
1548 * Dentry create replay function.
1549 *
1550 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
1551 * inode for which we are trying to create a dentry here should already
1552 * have been replayed before we get here.
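     *
     * For illustration, the relevant part of the on-disk tag stream for a
     * file create roughly looks like this (a sketch, not the exact layout):
     *
     *	... | INODE(ino) | CREAT(parent_ino, ino, dname) | ... | TAIL |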
1553 */
1554static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555                                 u8 *val)
1556{
1557        int ret = 0;
1558        struct inode *inode = NULL;
1559        struct inode *dir = NULL;
1560        struct dentry_info_args darg;
1561
1562        tl_to_darg(&darg, tl, val);
1563
1564        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565                        darg.parent_ino, darg.dname_len);
1566
1567        /* This takes care of updating the group descriptor and other metadata */
1568        ret = ext4_mark_inode_used(sb, darg.ino);
1569        if (ret)
1570                goto out;
1571
1572        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573        if (IS_ERR(inode)) {
1574                jbd_debug(1, "Inode %d not found.", darg.ino);
1575                inode = NULL;
1576                ret = -EINVAL;
1577                goto out;
1578        }
1579
1580        if (S_ISDIR(inode->i_mode)) {
1581                /*
1582                 * If we are creating a directory, we need to make sure that the
1583                 * dot and dot dot dirents are set up properly.
1584                 */
1585                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586                if (IS_ERR(dir)) {
1587                        jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1588                        goto out;
1589                }
1590                ret = ext4_init_new_dir(NULL, dir, inode);
1591                iput(dir);
1592                if (ret) {
1593                        ret = 0;
1594                        goto out;
1595                }
1596        }
1597        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598        if (ret)
1599                goto out;
1600        set_nlink(inode, 1);
1601        ext4_mark_inode_dirty(NULL, inode);
1602out:
1603        if (inode)
1604                iput(inode);
1605        return ret;
1606}
1607
1608/*
1609 * Record physical disk regions which are in use as per the fast commit
1610 * area. Our simple replay phase allocator excludes these regions from allocation.
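     *
     * Regions are collected from EXT4_FC_TAG_ADD_RANGE tags during the scan
     * phase; ext4_fc_replay_check_excluded() consults the resulting array.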
1611 */
1612static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614{
1615        struct ext4_fc_replay_state *state;
1616        struct ext4_fc_alloc_region *region;
1617
1618        state = &EXT4_SB(sb)->s_fc_replay_state;
1619        if (state->fc_regions_used == state->fc_regions_size) {
1620                state->fc_regions_size +=
1621                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
1622                state->fc_regions = krealloc(
1623                                        state->fc_regions,
1624                                        state->fc_regions_size *
1625                                        sizeof(struct ext4_fc_alloc_region),
1626                                        GFP_KERNEL);
1627                if (!state->fc_regions)
1628                        return -ENOMEM;
1629        }
1630        region = &state->fc_regions[state->fc_regions_used++];
1631        region->ino = ino;
1632        region->lblk = lblk;
1633        region->pblk = pblk;
1634        region->len = len;
1635
1636        return 0;
1637}
1638
1639/* Replay add range tag */
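    /*
     * For every logged range there are three cases, handled in the loop
     * below:
     *  (a) the range is not mapped - insert a new extent for it;
     *  (b) it is mapped but to different physical blocks - repoint the
     *      extent to the logged physical blocks and free the old ones;
     *  (c) it is mapped to the right blocks - just fix up the written /
     *      unwritten state if it changed.
     */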
1640static int ext4_fc_replay_add_range(struct super_block *sb,
1641                                    struct ext4_fc_tl *tl, u8 *val)
1642{
1643        struct ext4_fc_add_range fc_add_ex;
1644        struct ext4_extent newex, *ex;
1645        struct inode *inode;
1646        ext4_lblk_t start, cur;
1647        int remaining, len;
1648        ext4_fsblk_t start_pblk;
1649        struct ext4_map_blocks map;
1650        struct ext4_ext_path *path = NULL;
1651        int ret;
1652
1653        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658                ext4_ext_get_actual_len(ex));
1659
1660        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661        if (IS_ERR(inode)) {
1662                jbd_debug(1, "Inode not found.");
1663                return 0;
1664        }
1665
1666        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667
1668        start = le32_to_cpu(ex->ee_block);
1669        start_pblk = ext4_ext_pblock(ex);
1670        len = ext4_ext_get_actual_len(ex);
1671
1672        cur = start;
1673        remaining = len;
1674        jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676                  inode->i_ino);
1677
1678        while (remaining > 0) {
1679                map.m_lblk = cur;
1680                map.m_len = remaining;
1681                map.m_pblk = 0;
1682                ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684                if (ret < 0) {
1685                        iput(inode);
1686                        return 0;
1687                }
1688
1689                if (ret == 0) {
1690                        /* Range is not mapped */
1691                        path = ext4_find_extent(inode, cur, NULL, 0);
1692                        if (IS_ERR(path)) {
1693                                iput(inode);
1694                                return 0;
1695                        }
1696                        memset(&newex, 0, sizeof(newex));
1697                        newex.ee_block = cpu_to_le32(cur);
1698                        ext4_ext_store_pblock(
1699                                &newex, start_pblk + cur - start);
1700                        newex.ee_len = cpu_to_le16(map.m_len);
1701                        if (ext4_ext_is_unwritten(ex))
1702                                ext4_ext_mark_unwritten(&newex);
1703                        down_write(&EXT4_I(inode)->i_data_sem);
1704                        ret = ext4_ext_insert_extent(
1705                                NULL, inode, &path, &newex, 0);
1706                        up_write((&EXT4_I(inode)->i_data_sem));
1707                        ext4_ext_drop_refs(path);
1708                        kfree(path);
1709                        if (ret) {
1710                                iput(inode);
1711                                return 0;
1712                        }
1713                        goto next;
1714                }
1715
1716                if (start_pblk + cur - start != map.m_pblk) {
1717                        /*
1718                         * Logical to physical mapping changed. This can happen
1719                         * if this range was removed and then reallocated to
1720                         * map to new physical blocks during a fast commit.
1721                         */
1722                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723                                        ext4_ext_is_unwritten(ex),
1724                                        start_pblk + cur - start);
1725                        if (ret) {
1726                                iput(inode);
1727                                return 0;
1728                        }
1729                        /*
1730                         * Mark the old blocks as free since they aren't used
1731                         * anymore. We maintain an array of all the modified
1732                         * inodes. In case these blocks are still used at either
1733                         * a different logical range in the same inode or in
1734                         * some different inode, we will mark them as allocated
1735                         * at the end of the FC replay using our array of
1736                         * modified inodes.
1737                         */
1738                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739                        goto next;
1740                }
1741
1742                /* Range is mapped and needs a state change */
1743                jbd_debug(1, "Converting from %ld to %d %lld",
1744                          map.m_flags & EXT4_MAP_UNWRITTEN,
1745                          ext4_ext_is_unwritten(ex), map.m_pblk);
1746                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747                                        ext4_ext_is_unwritten(ex), map.m_pblk);
1748                if (ret) {
1749                        iput(inode);
1750                        return 0;
1751                }
1752                /*
1753                 * We may have split the extent tree while toggling the state.
1754                 * Try to shrink the extent tree now.
1755                 */
1756                ext4_ext_replay_shrink_inode(inode, start + len);
1757next:
1758                cur += map.m_len;
1759                remaining -= map.m_len;
1760        }
1761        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762                                        sb->s_blocksize_bits);
1763        iput(inode);
1764        return 0;
1765}
1766
1767/* Replay DEL_RANGE tag */
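    /*
     * Sketch of the flow below: walk the logged range and mark any blocks
     * still mapped to the inode as free in the in-memory bitmaps, then
     * punch a hole for the whole range and shrink the extent tree down to
     * i_size.
     */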
1768static int
1769ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770                         u8 *val)
1771{
1772        struct inode *inode;
1773        struct ext4_fc_del_range lrange;
1774        struct ext4_map_blocks map;
1775        ext4_lblk_t cur, remaining;
1776        int ret;
1777
1778        memcpy(&lrange, val, sizeof(lrange));
1779        cur = le32_to_cpu(lrange.fc_lblk);
1780        remaining = le32_to_cpu(lrange.fc_len);
1781
1782        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783                le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786        if (IS_ERR(inode)) {
1787                jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788                return 0;
1789        }
1790
1791        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792
1793        jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794                        inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795                        le32_to_cpu(lrange.fc_len));
1796        while (remaining > 0) {
1797                map.m_lblk = cur;
1798                map.m_len = remaining;
1799
1800                ret = ext4_map_blocks(NULL, inode, &map, 0);
1801                if (ret < 0) {
1802                        iput(inode);
1803                        return 0;
1804                }
1805                if (ret > 0) {
1806                        remaining -= ret;
1807                        cur += ret;
1808                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809                } else {
1810                        remaining -= map.m_len;
1811                        cur += map.m_len;
1812                }
1813        }
1814
1815        ret = ext4_punch_hole(inode,
1816                le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817                le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1818        if (ret)
1819                jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820        ext4_ext_replay_shrink_inode(inode,
1821                i_size_read(inode) >> sb->s_blocksize_bits);
1822        ext4_mark_inode_dirty(NULL, inode);
1823        iput(inode);
1824
1825        return 0;
1826}
1827
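    /*
     * Once all tags have been replayed, walk every inode recorded in
     * fc_modified_inodes[] and mark both its mapped blocks and the blocks
     * holding its extent tree nodes as in-use in the block bitmaps.
     */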
1828static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829{
1830        struct ext4_fc_replay_state *state;
1831        struct inode *inode;
1832        struct ext4_ext_path *path = NULL;
1833        struct ext4_map_blocks map;
1834        int i, ret, j;
1835        ext4_lblk_t cur, end;
1836
1837        state = &EXT4_SB(sb)->s_fc_replay_state;
1838        for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839                inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840                        EXT4_IGET_NORMAL);
1841                if (IS_ERR(inode)) {
1842                        jbd_debug(1, "Inode %d not found.",
1843                                state->fc_modified_inodes[i]);
1844                        continue;
1845                }
1846                cur = 0;
1847                end = EXT_MAX_BLOCKS;
1848                if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1849                        iput(inode);
1850                        continue;
1851                }
1852                while (cur < end) {
1853                        map.m_lblk = cur;
1854                        map.m_len = end - cur;
1855
1856                        ret = ext4_map_blocks(NULL, inode, &map, 0);
1857                        if (ret < 0)
1858                                break;
1859
1860                        if (ret > 0) {
1861                                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1862                                if (!IS_ERR(path)) {
1863                                        for (j = 0; j < path->p_depth; j++)
1864                                                ext4_mb_mark_bb(inode->i_sb,
1865                                                        path[j].p_block, 1, 1);
1866                                        ext4_ext_drop_refs(path);
1867                                        kfree(path);
1868                                }
1869                                cur += ret;
1870                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1871                                                        map.m_len, 1);
1872                        } else {
1873                                cur = cur + (map.m_len ? map.m_len : 1);
1874                        }
1875                }
1876                iput(inode);
1877        }
1878}
1879
1880/*
1881 * Check if a block is in the excluded regions for block allocation. The simple
1882 * allocator that runs during the replay phase calls this function to see
1883 * if it is okay to use a block.
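     *
     * A minimal usage sketch (hypothetical replay-phase caller):
     *
     *	if (ext4_fc_replay_check_excluded(sb, blk))
     *		goto try_next_block;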
1884 */
1885bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1886{
1887        int i;
1888        struct ext4_fc_replay_state *state;
1889
1890        state = &EXT4_SB(sb)->s_fc_replay_state;
1891        for (i = 0; i < state->fc_regions_valid; i++) {
1892                if (state->fc_regions[i].ino == 0 ||
1893                        state->fc_regions[i].len == 0)
1894                        continue;
1895                if (blk >= state->fc_regions[i].pblk &&
1896                    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1897                        return true;
1898        }
1899        return false;
1900}
1901
1902/* Cleanup function called after replay */
1903void ext4_fc_replay_cleanup(struct super_block *sb)
1904{
1905        struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1908        kfree(sbi->s_fc_replay_state.fc_regions);
1909        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1910}
1911
1912/*
1913 * Recovery Scan phase handler
1914 *
1915 * This function is called during the scan phase and is responsible
1916 * for doing the following things:
1917 * - Make sure the fast commit area has valid tags for replay
1918 * - Count number of tags that need to be replayed by the replay handler
1919 * - Verify CRC
1920 * - Create a list of excluded blocks for allocation during replay phase
1921 *
1922 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1923 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1924 * to indicate that scan has finished and JBD2 can now start replay phase.
1925 * It returns a negative error code if something went wrong. At the end of a
1926 * successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set to
1927 * indicate the number of tags that need to be replayed during the replay phase.
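     *
     * A sketch of the TLV walk performed on each fast commit block,
     * mirroring the loop below:
     *
     *	for (cur = start; cur < end;
     *	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
     *		memcpy(&tl, cur, sizeof(tl));
     *		val = cur + sizeof(tl);
     *		(dispatch on le16_to_cpu(tl.fc_tag), updating the running CRC)
     *	}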
1928 */
1929static int ext4_fc_replay_scan(journal_t *journal,
1930                                struct buffer_head *bh, int off,
1931                                tid_t expected_tid)
1932{
1933        struct super_block *sb = journal->j_private;
1934        struct ext4_sb_info *sbi = EXT4_SB(sb);
1935        struct ext4_fc_replay_state *state;
1936        int ret = JBD2_FC_REPLAY_CONTINUE;
1937        struct ext4_fc_add_range ext;
1938        struct ext4_fc_tl tl;
1939        struct ext4_fc_tail tail;
1940        __u8 *start, *end, *cur, *val;
1941        struct ext4_fc_head head;
1942        struct ext4_extent *ex;
1943
1944        state = &sbi->s_fc_replay_state;
1945
1946        start = (u8 *)bh->b_data;
1947        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1948
1949        if (state->fc_replay_expected_off == 0) {
1950                state->fc_cur_tag = 0;
1951                state->fc_replay_num_tags = 0;
1952                state->fc_crc = 0;
1953                state->fc_regions = NULL;
1954                state->fc_regions_valid = state->fc_regions_used =
1955                        state->fc_regions_size = 0;
1956                /* Check if we can stop early */
1957                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1958                        != EXT4_FC_TAG_HEAD)
1959                        return 0;
1960        }
1961
1962        if (off != state->fc_replay_expected_off) {
1963                ret = -EFSCORRUPTED;
1964                goto out_err;
1965        }
1966
1967        state->fc_replay_expected_off++;
1968        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1969                memcpy(&tl, cur, sizeof(tl));
1970                val = cur + sizeof(tl);
1971                jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1972                          tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1973                switch (le16_to_cpu(tl.fc_tag)) {
1974                case EXT4_FC_TAG_ADD_RANGE:
1975                        memcpy(&ext, val, sizeof(ext));
1976                        ex = (struct ext4_extent *)&ext.fc_ex;
1977                        ret = ext4_fc_record_regions(sb,
1978                                le32_to_cpu(ext.fc_ino),
1979                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1980                                ext4_ext_get_actual_len(ex));
1981                        if (ret < 0)
1982                                break;
1983                        ret = JBD2_FC_REPLAY_CONTINUE;
1984                        fallthrough;
1985                case EXT4_FC_TAG_DEL_RANGE:
1986                case EXT4_FC_TAG_LINK:
1987                case EXT4_FC_TAG_UNLINK:
1988                case EXT4_FC_TAG_CREAT:
1989                case EXT4_FC_TAG_INODE:
1990                case EXT4_FC_TAG_PAD:
1991                        state->fc_cur_tag++;
1992                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1993                                        sizeof(tl) + le16_to_cpu(tl.fc_len));
1994                        break;
1995                case EXT4_FC_TAG_TAIL:
1996                        state->fc_cur_tag++;
1997                        memcpy(&tail, val, sizeof(tail));
1998                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1999                                                sizeof(tl) +
2000                                                offsetof(struct ext4_fc_tail,
2001                                                fc_crc));
2002                        if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2003                                le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2004                                state->fc_replay_num_tags = state->fc_cur_tag;
2005                                state->fc_regions_valid =
2006                                        state->fc_regions_used;
2007                        } else {
2008                                ret = state->fc_replay_num_tags ?
2009                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2010                        }
2011                        state->fc_crc = 0;
2012                        break;
2013                case EXT4_FC_TAG_HEAD:
2014                        memcpy(&head, val, sizeof(head));
2015                        if (le32_to_cpu(head.fc_features) &
2016                                ~EXT4_FC_SUPPORTED_FEATURES) {
2017                                ret = -EOPNOTSUPP;
2018                                break;
2019                        }
2020                        if (le32_to_cpu(head.fc_tid) != expected_tid) {
2021                                ret = JBD2_FC_REPLAY_STOP;
2022                                break;
2023                        }
2024                        state->fc_cur_tag++;
2025                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2026                                            sizeof(tl) + le16_to_cpu(tl.fc_len));
2027                        break;
2028                default:
2029                        ret = state->fc_replay_num_tags ?
2030                                JBD2_FC_REPLAY_STOP : -ECANCELED;
2031                }
2032                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2033                        break;
2034        }
2035
2036out_err:
2037        trace_ext4_fc_replay_scan(sb, ret, off);
2038        return ret;
2039}
2040
2041/*
2042 * Main recovery path entry point.
2043 * The meaning of the return codes is the same as above.
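     * PASS_SCAN calls are forwarded to ext4_fc_replay_scan(); later passes
     * walk the same TLV stream again and dispatch each tag to its replay
     * helper below.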
2044 */
2045static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2046                                enum passtype pass, int off, tid_t expected_tid)
2047{
2048        struct super_block *sb = journal->j_private;
2049        struct ext4_sb_info *sbi = EXT4_SB(sb);
2050        struct ext4_fc_tl tl;
2051        __u8 *start, *end, *cur, *val;
2052        int ret = JBD2_FC_REPLAY_CONTINUE;
2053        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2054        struct ext4_fc_tail tail;
2055
2056        if (pass == PASS_SCAN) {
2057                state->fc_current_pass = PASS_SCAN;
2058                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2059        }
2060
2061        if (state->fc_current_pass != pass) {
2062                state->fc_current_pass = pass;
2063                sbi->s_mount_state |= EXT4_FC_REPLAY;
2064        }
2065        if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2066                jbd_debug(1, "Replay stops\n");
2067                ext4_fc_set_bitmaps_and_counters(sb);
2068                return 0;
2069        }
2070
2071#ifdef CONFIG_EXT4_DEBUG
2072        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2073                pr_warn("Dropping fc block %d because max_replay set\n", off);
2074                return JBD2_FC_REPLAY_STOP;
2075        }
2076#endif
2077
2078        start = (u8 *)bh->b_data;
2079        end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2080
2081        for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2082                memcpy(&tl, cur, sizeof(tl));
2083                val = cur + sizeof(tl);
2084
2085                if (state->fc_replay_num_tags == 0) {
2086                        ret = JBD2_FC_REPLAY_STOP;
2087                        ext4_fc_set_bitmaps_and_counters(sb);
2088                        break;
2089                }
2090                jbd_debug(3, "Replay phase, tag:%s\n",
2091                                tag2str(le16_to_cpu(tl.fc_tag)));
2092                state->fc_replay_num_tags--;
2093                switch (le16_to_cpu(tl.fc_tag)) {
2094                case EXT4_FC_TAG_LINK:
2095                        ret = ext4_fc_replay_link(sb, &tl, val);
2096                        break;
2097                case EXT4_FC_TAG_UNLINK:
2098                        ret = ext4_fc_replay_unlink(sb, &tl, val);
2099                        break;
2100                case EXT4_FC_TAG_ADD_RANGE:
2101                        ret = ext4_fc_replay_add_range(sb, &tl, val);
2102                        break;
2103                case EXT4_FC_TAG_CREAT:
2104                        ret = ext4_fc_replay_create(sb, &tl, val);
2105                        break;
2106                case EXT4_FC_TAG_DEL_RANGE:
2107                        ret = ext4_fc_replay_del_range(sb, &tl, val);
2108                        break;
2109                case EXT4_FC_TAG_INODE:
2110                        ret = ext4_fc_replay_inode(sb, &tl, val);
2111                        break;
2112                case EXT4_FC_TAG_PAD:
2113                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2114                                             le16_to_cpu(tl.fc_len), 0);
2115                        break;
2116                case EXT4_FC_TAG_TAIL:
2117                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2118                                             le16_to_cpu(tl.fc_len), 0);
2119                        memcpy(&tail, val, sizeof(tail));
2120                        WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2121                        break;
2122                case EXT4_FC_TAG_HEAD:
2123                        break;
2124                default:
2125                        trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2126                                             le16_to_cpu(tl.fc_len), 0);
2127                        ret = -ECANCELED;
2128                        break;
2129                }
2130                if (ret < 0)
2131                        break;
2132                ret = JBD2_FC_REPLAY_CONTINUE;
2133        }
2134        return ret;
2135}
2136
2137void ext4_fc_init(struct super_block *sb, journal_t *journal)
2138{
2139        /*
2140         * We set the replay callback even if fast commit is disabled because
2141         * we could still have fast commit blocks that need to be replayed even
2142         * if fast commit has now been turned off.
2143         */
2144        journal->j_fc_replay_callback = ext4_fc_replay;
2145        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2146                return;
2147        journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2148}
2149
2150static const char *fc_ineligible_reasons[] = {
2151        "Extended attributes changed",
2152        "Cross rename",
2153        "Journal flag changed",
2154        "Insufficient memory",
2155        "Swap boot",
2156        "Resize",
2157        "Dir renamed",
2158        "Falloc range op",
2159        "Data journalling",
2160        "FC Commit Failed"
2161};
2162
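    /*
     * Shown via procfs. With the format strings below the output looks
     * roughly like this (numbers are illustrative):
     *
     *	fc stats:
     *	100 commits
     *	3 ineligible
     *	250 numblks
     *	1200us avg_commit_time
     *	Ineligible reasons:
     *	"Extended attributes changed":	1
     *	...
     */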
2163int ext4_fc_info_show(struct seq_file *seq, void *v)
2164{
2165        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2166        struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2167        int i;
2168
2169        if (v != SEQ_START_TOKEN)
2170                return 0;
2171
2172        seq_printf(seq,
2173                "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2174                   stats->fc_num_commits, stats->fc_ineligible_commits,
2175                   stats->fc_numblks,
2176                   div_u64(sbi->s_fc_avg_commit_time, 1000));
2177        seq_puts(seq, "Ineligible reasons:\n");
2178        for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2179                seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2180                        stats->fc_ineligible_reason_count[i]);
2181
2182        return 0;
2183}
2184
2185int __init ext4_fc_init_dentry_cache(void)
2186{
2187        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2188                                           SLAB_RECLAIM_ACCOUNT);
2189
2190        if (ext4_fc_dentry_cachep == NULL)
2191                return -ENOMEM;
2192
2193        return 0;
2194}
2195