linux/fs/jbd2/checkpoint.c
<<
>>
Prefs
   1/*
   2 * linux/fs/jbd2/checkpoint.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
   5 *
   6 * Copyright 1999 Red Hat Software --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Checkpoint routines for the generic filesystem journaling code.
  13 * Part of the ext2fs journaling system.
  14 *
  15 * Checkpointing is the process of ensuring that a section of the log is
  16 * committed fully to disk, so that that portion of the log can be
  17 * reused.
  18 */
  19
  20#include <linux/time.h>
  21#include <linux/fs.h>
  22#include <linux/jbd2.h>
  23#include <linux/errno.h>
  24#include <linux/slab.h>
  25#include <linux/blkdev.h>
  26#include <trace/events/jbd2.h>
  27
  28/*
  29 * Unlink a buffer from a transaction checkpoint list.
  30 *
  31 * Called with j_list_lock held.
  32 */
  33static inline void __buffer_unlink_first(struct journal_head *jh)
  34{
  35        transaction_t *transaction = jh->b_cp_transaction;
  36
  37        jh->b_cpnext->b_cpprev = jh->b_cpprev;
  38        jh->b_cpprev->b_cpnext = jh->b_cpnext;
  39        if (transaction->t_checkpoint_list == jh) {
  40                transaction->t_checkpoint_list = jh->b_cpnext;
  41                if (transaction->t_checkpoint_list == jh)
  42                        transaction->t_checkpoint_list = NULL;
  43        }
  44}
  45
  46/*
  47 * Unlink a buffer from a transaction checkpoint(io) list.
  48 *
  49 * Called with j_list_lock held.
  50 */
  51static inline void __buffer_unlink(struct journal_head *jh)
  52{
  53        transaction_t *transaction = jh->b_cp_transaction;
  54
  55        __buffer_unlink_first(jh);
  56        if (transaction->t_checkpoint_io_list == jh) {
  57                transaction->t_checkpoint_io_list = jh->b_cpnext;
  58                if (transaction->t_checkpoint_io_list == jh)
  59                        transaction->t_checkpoint_io_list = NULL;
  60        }
  61}
  62
  63/*
  64 * Move a buffer from the checkpoint list to the checkpoint io list
  65 *
  66 * Called with j_list_lock held
  67 */
  68static inline void __buffer_relink_io(struct journal_head *jh)
  69{
  70        transaction_t *transaction = jh->b_cp_transaction;
  71
  72        __buffer_unlink_first(jh);
  73
  74        if (!transaction->t_checkpoint_io_list) {
  75                jh->b_cpnext = jh->b_cpprev = jh;
  76        } else {
  77                jh->b_cpnext = transaction->t_checkpoint_io_list;
  78                jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
  79                jh->b_cpprev->b_cpnext = jh;
  80                jh->b_cpnext->b_cpprev = jh;
  81        }
  82        transaction->t_checkpoint_io_list = jh;
  83}
  84
  85/*
  86 * Try to release a checkpointed buffer from its transaction.
  87 * Returns 1 if we released it and 2 if we also released the
  88 * whole transaction.
  89 *
  90 * Requires j_list_lock
  91 */
  92static int __try_to_free_cp_buf(struct journal_head *jh)
  93{
  94        int ret = 0;
  95        struct buffer_head *bh = jh2bh(jh);
  96
  97        if (jh->b_transaction == NULL && !buffer_locked(bh) &&
  98            !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
  99                JBUFFER_TRACE(jh, "remove from checkpoint list");
 100                ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 101        }
 102        return ret;
 103}
 104
 105/*
 106 * __jbd2_log_wait_for_space: wait until there is space in the journal.
 107 *
 108 * Called under j-state_lock *only*.  It will be unlocked if we have to wait
 109 * for a checkpoint to free up some space in the log.
 110 */
 111void __jbd2_log_wait_for_space(journal_t *journal)
 112{
 113        int nblocks, space_left;
 114        /* assert_spin_locked(&journal->j_state_lock); */
 115
 116        nblocks = jbd2_space_needed(journal);
 117        while (jbd2_log_space_left(journal) < nblocks) {
 118                write_unlock(&journal->j_state_lock);
 119                mutex_lock(&journal->j_checkpoint_mutex);
 120
 121                /*
 122                 * Test again, another process may have checkpointed while we
 123                 * were waiting for the checkpoint lock. If there are no
 124                 * transactions ready to be checkpointed, try to recover
 125                 * journal space by calling cleanup_journal_tail(), and if
 126                 * that doesn't work, by waiting for the currently committing
 127                 * transaction to complete.  If there is absolutely no way
 128                 * to make progress, this is either a BUG or corrupted
 129                 * filesystem, so abort the journal and leave a stack
 130                 * trace for forensic evidence.
 131                 */
 132                write_lock(&journal->j_state_lock);
 133                if (journal->j_flags & JBD2_ABORT) {
 134                        mutex_unlock(&journal->j_checkpoint_mutex);
 135                        return;
 136                }
 137                spin_lock(&journal->j_list_lock);
 138                nblocks = jbd2_space_needed(journal);
 139                space_left = jbd2_log_space_left(journal);
 140                if (space_left < nblocks) {
 141                        int chkpt = journal->j_checkpoint_transactions != NULL;
 142                        tid_t tid = 0;
 143
 144                        if (journal->j_committing_transaction)
 145                                tid = journal->j_committing_transaction->t_tid;
 146                        spin_unlock(&journal->j_list_lock);
 147                        write_unlock(&journal->j_state_lock);
 148                        if (chkpt) {
 149                                jbd2_log_do_checkpoint(journal);
 150                        } else if (jbd2_cleanup_journal_tail(journal) == 0) {
 151                                /* We were able to recover space; yay! */
 152                                ;
 153                        } else if (tid) {
 154                                /*
 155                                 * jbd2_journal_commit_transaction() may want
 156                                 * to take the checkpoint_mutex if JBD2_FLUSHED
 157                                 * is set.  So we need to temporarily drop it.
 158                                 */
 159                                mutex_unlock(&journal->j_checkpoint_mutex);
 160                                jbd2_log_wait_commit(journal, tid);
 161                                write_lock(&journal->j_state_lock);
 162                                continue;
 163                        } else {
 164                                printk(KERN_ERR "%s: needed %d blocks and "
 165                                       "only had %d space available\n",
 166                                       __func__, nblocks, space_left);
 167                                printk(KERN_ERR "%s: no way to get more "
 168                                       "journal space in %s\n", __func__,
 169                                       journal->j_devname);
 170                                WARN_ON(1);
 171                                jbd2_journal_abort(journal, 0);
 172                        }
 173                        write_lock(&journal->j_state_lock);
 174                } else {
 175                        spin_unlock(&journal->j_list_lock);
 176                }
 177                mutex_unlock(&journal->j_checkpoint_mutex);
 178        }
 179}
 180
 181static void
 182__flush_batch(journal_t *journal, int *batch_count)
 183{
 184        int i;
 185        struct blk_plug plug;
 186
 187        blk_start_plug(&plug);
 188        for (i = 0; i < *batch_count; i++)
 189                write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
 190        blk_finish_plug(&plug);
 191
 192        for (i = 0; i < *batch_count; i++) {
 193                struct buffer_head *bh = journal->j_chkpt_bhs[i];
 194                BUFFER_TRACE(bh, "brelse");
 195                __brelse(bh);
 196        }
 197        *batch_count = 0;
 198}
 199
 200/*
 201 * Perform an actual checkpoint. We take the first transaction on the
 202 * list of transactions to be checkpointed and send all its buffers
 203 * to disk. We submit larger chunks of data at once.
 204 *
 205 * The journal should be locked before calling this function.
 206 * Called with j_checkpoint_mutex held.
 207 */
 208int jbd2_log_do_checkpoint(journal_t *journal)
 209{
 210        struct journal_head     *jh;
 211        struct buffer_head      *bh;
 212        transaction_t           *transaction;
 213        tid_t                   this_tid;
 214        int                     result, batch_count = 0;
 215
 216        jbd_debug(1, "Start checkpoint\n");
 217
 218        /*
 219         * First thing: if there are any transactions in the log which
 220         * don't need checkpointing, just eliminate them from the
 221         * journal straight away.
 222         */
 223        result = jbd2_cleanup_journal_tail(journal);
 224        trace_jbd2_checkpoint(journal, result);
 225        jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 226        if (result <= 0)
 227                return result;
 228
 229        /*
 230         * OK, we need to start writing disk blocks.  Take one transaction
 231         * and write it.
 232         */
 233        result = 0;
 234        spin_lock(&journal->j_list_lock);
 235        if (!journal->j_checkpoint_transactions)
 236                goto out;
 237        transaction = journal->j_checkpoint_transactions;
 238        if (transaction->t_chp_stats.cs_chp_time == 0)
 239                transaction->t_chp_stats.cs_chp_time = jiffies;
 240        this_tid = transaction->t_tid;
 241restart:
 242        /*
 243         * If someone cleaned up this transaction while we slept, we're
 244         * done (maybe it's a new transaction, but it fell at the same
 245         * address).
 246         */
 247        if (journal->j_checkpoint_transactions != transaction ||
 248            transaction->t_tid != this_tid)
 249                goto out;
 250
 251        /* checkpoint all of the transaction's buffers */
 252        while (transaction->t_checkpoint_list) {
 253                jh = transaction->t_checkpoint_list;
 254                bh = jh2bh(jh);
 255
 256                if (buffer_locked(bh)) {
 257                        spin_unlock(&journal->j_list_lock);
 258                        get_bh(bh);
 259                        wait_on_buffer(bh);
 260                        /* the journal_head may have gone by now */
 261                        BUFFER_TRACE(bh, "brelse");
 262                        __brelse(bh);
 263                        goto retry;
 264                }
 265                if (jh->b_transaction != NULL) {
 266                        transaction_t *t = jh->b_transaction;
 267                        tid_t tid = t->t_tid;
 268
 269                        transaction->t_chp_stats.cs_forced_to_close++;
 270                        spin_unlock(&journal->j_list_lock);
 271                        if (unlikely(journal->j_flags & JBD2_UNMOUNT))
 272                                /*
 273                                 * The journal thread is dead; so
 274                                 * starting and waiting for a commit
 275                                 * to finish will cause us to wait for
 276                                 * a _very_ long time.
 277                                 */
 278                                printk(KERN_ERR
 279                "JBD2: %s: Waiting for Godot: block %llu\n",
 280                journal->j_devname, (unsigned long long) bh->b_blocknr);
 281
 282                        jbd2_log_start_commit(journal, tid);
 283                        jbd2_log_wait_commit(journal, tid);
 284                        goto retry;
 285                }
 286                if (!buffer_dirty(bh)) {
 287                        if (unlikely(buffer_write_io_error(bh)) && !result)
 288                                result = -EIO;
 289                        BUFFER_TRACE(bh, "remove from checkpoint");
 290                        if (__jbd2_journal_remove_checkpoint(jh))
 291                                /* The transaction was released; we're done */
 292                                goto out;
 293                        continue;
 294                }
 295                /*
 296                 * Important: we are about to write the buffer, and
 297                 * possibly block, while still holding the journal
 298                 * lock.  We cannot afford to let the transaction
 299                 * logic start messing around with this buffer before
 300                 * we write it to disk, as that would break
 301                 * recoverability.
 302                 */
 303                BUFFER_TRACE(bh, "queue");
 304                get_bh(bh);
 305                J_ASSERT_BH(bh, !buffer_jwrite(bh));
 306                journal->j_chkpt_bhs[batch_count++] = bh;
 307                __buffer_relink_io(jh);
 308                transaction->t_chp_stats.cs_written++;
 309                if ((batch_count == JBD2_NR_BATCH) ||
 310                    need_resched() ||
 311                    spin_needbreak(&journal->j_list_lock))
 312                        goto unlock_and_flush;
 313        }
 314
 315        if (batch_count) {
 316                unlock_and_flush:
 317                        spin_unlock(&journal->j_list_lock);
 318                retry:
 319                        if (batch_count)
 320                                __flush_batch(journal, &batch_count);
 321                        spin_lock(&journal->j_list_lock);
 322                        goto restart;
 323        }
 324
 325        /*
 326         * Now we issued all of the transaction's buffers, let's deal
 327         * with the buffers that are out for I/O.
 328         */
 329restart2:
 330        /* Did somebody clean up the transaction in the meanwhile? */
 331        if (journal->j_checkpoint_transactions != transaction ||
 332            transaction->t_tid != this_tid)
 333                goto out;
 334
 335        while (transaction->t_checkpoint_io_list) {
 336                jh = transaction->t_checkpoint_io_list;
 337                bh = jh2bh(jh);
 338                if (buffer_locked(bh)) {
 339                        spin_unlock(&journal->j_list_lock);
 340                        get_bh(bh);
 341                        wait_on_buffer(bh);
 342                        /* the journal_head may have gone by now */
 343                        BUFFER_TRACE(bh, "brelse");
 344                        __brelse(bh);
 345                        spin_lock(&journal->j_list_lock);
 346                        goto restart2;
 347                }
 348                if (unlikely(buffer_write_io_error(bh)) && !result)
 349                        result = -EIO;
 350
 351                /*
 352                 * Now in whatever state the buffer currently is, we
 353                 * know that it has been written out and so we can
 354                 * drop it from the list
 355                 */
 356                if (__jbd2_journal_remove_checkpoint(jh))
 357                        break;
 358        }
 359out:
 360        spin_unlock(&journal->j_list_lock);
 361        if (result < 0)
 362                jbd2_journal_abort(journal, result);
 363        else
 364                result = jbd2_cleanup_journal_tail(journal);
 365
 366        return (result < 0) ? result : 0;
 367}
 368
 369/*
 370 * Check the list of checkpoint transactions for the journal to see if
 371 * we have already got rid of any since the last update of the log tail
 372 * in the journal superblock.  If so, we can instantly roll the
 373 * superblock forward to remove those transactions from the log.
 374 *
 375 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
 376 *
 377 * Called with the journal lock held.
 378 *
 379 * This is the only part of the journaling code which really needs to be
 380 * aware of transaction aborts.  Checkpointing involves writing to the
 381 * main filesystem area rather than to the journal, so it can proceed
 382 * even in abort state, but we must not update the super block if
 383 * checkpointing may have failed.  Otherwise, we would lose some metadata
 384 * buffers which should be written-back to the filesystem.
 385 */
 386
 387int jbd2_cleanup_journal_tail(journal_t *journal)
 388{
 389        tid_t           first_tid;
 390        unsigned long   blocknr;
 391
 392        if (is_journal_aborted(journal))
 393                return 1;
 394
 395        if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
 396                return 1;
 397        J_ASSERT(blocknr != 0);
 398
 399        /*
 400         * We need to make sure that any blocks that were recently written out
 401         * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
 402         * we drop the transactions from the journal. It's unlikely this will
 403         * be necessary, especially with an appropriately sized journal, but we
 404         * need this to guarantee correctness.  Fortunately
 405         * jbd2_cleanup_journal_tail() doesn't get called all that often.
 406         */
 407        if (journal->j_flags & JBD2_BARRIER)
 408                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 409
 410        __jbd2_update_log_tail(journal, first_tid, blocknr);
 411        return 0;
 412}
 413
 414
 415/* Checkpoint list management */
 416
 417/*
 418 * journal_clean_one_cp_list
 419 *
 420 * Find all the written-back checkpoint buffers in the given list and
 421 * release them.
 422 *
 423 * Called with j_list_lock held.
 424 * Returns 1 if we freed the transaction, 0 otherwise.
 425 */
 426static int journal_clean_one_cp_list(struct journal_head *jh)
 427{
 428        struct journal_head *last_jh;
 429        struct journal_head *next_jh = jh;
 430        int ret;
 431        int freed = 0;
 432
 433        if (!jh)
 434                return 0;
 435
 436        last_jh = jh->b_cpprev;
 437        do {
 438                jh = next_jh;
 439                next_jh = jh->b_cpnext;
 440                ret = __try_to_free_cp_buf(jh);
 441                if (!ret)
 442                        return freed;
 443                if (ret == 2)
 444                        return 1;
 445                freed = 1;
 446                /*
 447                 * This function only frees up some memory
 448                 * if possible so we dont have an obligation
 449                 * to finish processing. Bail out if preemption
 450                 * requested:
 451                 */
 452                if (need_resched())
 453                        return freed;
 454        } while (jh != last_jh);
 455
 456        return freed;
 457}
 458
 459/*
 460 * journal_clean_checkpoint_list
 461 *
 462 * Find all the written-back checkpoint buffers in the journal and release them.
 463 *
 464 * Called with j_list_lock held.
 465 */
 466void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 467{
 468        transaction_t *transaction, *last_transaction, *next_transaction;
 469        int ret;
 470
 471        transaction = journal->j_checkpoint_transactions;
 472        if (!transaction)
 473                return;
 474
 475        last_transaction = transaction->t_cpprev;
 476        next_transaction = transaction;
 477        do {
 478                transaction = next_transaction;
 479                next_transaction = transaction->t_cpnext;
 480                ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
 481                /*
 482                 * This function only frees up some memory if possible so we
 483                 * dont have an obligation to finish processing. Bail out if
 484                 * preemption requested:
 485                 */
 486                if (need_resched())
 487                        return;
 488                if (ret)
 489                        continue;
 490                /*
 491                 * It is essential that we are as careful as in the case of
 492                 * t_checkpoint_list with removing the buffer from the list as
 493                 * we can possibly see not yet submitted buffers on io_list
 494                 */
 495                ret = journal_clean_one_cp_list(transaction->
 496                                t_checkpoint_io_list);
 497                if (need_resched())
 498                        return;
 499                /*
 500                 * Stop scanning if we couldn't free the transaction. This
 501                 * avoids pointless scanning of transactions which still
 502                 * weren't checkpointed.
 503                 */
 504                if (!ret)
 505                        return;
 506        } while (transaction != last_transaction);
 507}
 508
 509/*
 510 * journal_remove_checkpoint: called after a buffer has been committed
 511 * to disk (either by being write-back flushed to disk, or being
 512 * committed to the log).
 513 *
 514 * We cannot safely clean a transaction out of the log until all of the
 515 * buffer updates committed in that transaction have safely been stored
 516 * elsewhere on disk.  To achieve this, all of the buffers in a
 517 * transaction need to be maintained on the transaction's checkpoint
 518 * lists until they have been rewritten, at which point this function is
 519 * called to remove the buffer from the existing transaction's
 520 * checkpoint lists.
 521 *
 522 * The function returns 1 if it frees the transaction, 0 otherwise.
 523 * The function can free jh and bh.
 524 *
 525 * This function is called with j_list_lock held.
 526 */
 527int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 528{
 529        struct transaction_chp_stats_s *stats;
 530        transaction_t *transaction;
 531        journal_t *journal;
 532        int ret = 0;
 533
 534        JBUFFER_TRACE(jh, "entry");
 535
 536        if ((transaction = jh->b_cp_transaction) == NULL) {
 537                JBUFFER_TRACE(jh, "not on transaction");
 538                goto out;
 539        }
 540        journal = transaction->t_journal;
 541
 542        JBUFFER_TRACE(jh, "removing from transaction");
 543        __buffer_unlink(jh);
 544        jh->b_cp_transaction = NULL;
 545        jbd2_journal_put_journal_head(jh);
 546
 547        if (transaction->t_checkpoint_list != NULL ||
 548            transaction->t_checkpoint_io_list != NULL)
 549                goto out;
 550
 551        /*
 552         * There is one special case to worry about: if we have just pulled the
 553         * buffer off a running or committing transaction's checkpoing list,
 554         * then even if the checkpoint list is empty, the transaction obviously
 555         * cannot be dropped!
 556         *
 557         * The locking here around t_state is a bit sleazy.
 558         * See the comment at the end of jbd2_journal_commit_transaction().
 559         */
 560        if (transaction->t_state != T_FINISHED)
 561                goto out;
 562
 563        /* OK, that was the last buffer for the transaction: we can now
 564           safely remove this transaction from the log */
 565        stats = &transaction->t_chp_stats;
 566        if (stats->cs_chp_time)
 567                stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
 568                                                    jiffies);
 569        trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
 570                                    transaction->t_tid, stats);
 571
 572        __jbd2_journal_drop_transaction(journal, transaction);
 573        jbd2_journal_free_transaction(transaction);
 574        ret = 1;
 575out:
 576        return ret;
 577}
 578
 579/*
 580 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
 581 * list so that we know when it is safe to clean the transaction out of
 582 * the log.
 583 *
 584 * Called with the journal locked.
 585 * Called with j_list_lock held.
 586 */
 587void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
 588                               transaction_t *transaction)
 589{
 590        JBUFFER_TRACE(jh, "entry");
 591        J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
 592        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
 593
 594        /* Get reference for checkpointing transaction */
 595        jbd2_journal_grab_journal_head(jh2bh(jh));
 596        jh->b_cp_transaction = transaction;
 597
 598        if (!transaction->t_checkpoint_list) {
 599                jh->b_cpnext = jh->b_cpprev = jh;
 600        } else {
 601                jh->b_cpnext = transaction->t_checkpoint_list;
 602                jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
 603                jh->b_cpprev->b_cpnext = jh;
 604                jh->b_cpnext->b_cpprev = jh;
 605        }
 606        transaction->t_checkpoint_list = jh;
 607}
 608
 609/*
 610 * We've finished with this transaction structure: adios...
 611 *
 612 * The transaction must have no links except for the checkpoint by this
 613 * point.
 614 *
 615 * Called with the journal locked.
 616 * Called with j_list_lock held.
 617 */
 618
 619void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 620{
 621        assert_spin_locked(&journal->j_list_lock);
 622        if (transaction->t_cpnext) {
 623                transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
 624                transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
 625                if (journal->j_checkpoint_transactions == transaction)
 626                        journal->j_checkpoint_transactions =
 627                                transaction->t_cpnext;
 628                if (journal->j_checkpoint_transactions == transaction)
 629                        journal->j_checkpoint_transactions = NULL;
 630        }
 631
 632        J_ASSERT(transaction->t_state == T_FINISHED);
 633        J_ASSERT(transaction->t_buffers == NULL);
 634        J_ASSERT(transaction->t_forget == NULL);
 635        J_ASSERT(transaction->t_shadow_list == NULL);
 636        J_ASSERT(transaction->t_checkpoint_list == NULL);
 637        J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 638        J_ASSERT(atomic_read(&transaction->t_updates) == 0);
 639        J_ASSERT(journal->j_committing_transaction != transaction);
 640        J_ASSERT(journal->j_running_transaction != transaction);
 641
 642        trace_jbd2_drop_transaction(journal, transaction);
 643
 644        jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
 645}
 646