/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}
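
/*
 * The commit path below installs this handler on the temporary descriptor
 * and metadata buffers it submits to the log (see "start_journal_io"), and
 * then detects completion by sleeping on the buffer lock with
 * wait_on_buffer() rather than via a completion callback of its own.
 */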

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on the
 * LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the VM
 * accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called with journal->j_list_lock held.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}
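
/*
 * Note how conservative the above is: the page is stripped only when this
 * buffer holds the last reference, the page has been truncated (no
 * ->mapping) and the page lock can be taken without blocking.  In every
 * other case we simply drop our ref and leave the page to the VM.
 */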

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
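
/*
 * After a failed inverted_lock() the caller has slept and j_list_lock has
 * been dropped, so the buffer's state must be revalidated once the locks
 * are re-taken - see the two call sites below.
 */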

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        int i, ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        /* Stamp a commit header into each 512-byte sector of the block */
        for (i = 0; i < bh->b_size; i += 512) {
                journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
                tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
                tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        }
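        /*
         * Recovery identifies this block by h_magic/h_blocktype and ties it
         * to the transaction via h_sequence; a commit block carries no
         * payload beyond the replicated header.
         */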

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
        if (journal->j_flags & JFS_BARRIER) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = sync_dirty_buffer(bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: barrier-based sync failed on %s - "
                        "disabling barriers\n",
                        bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JFS_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);

        return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use up our safety reference in submit_bh() */
                submit_bh(WRITE, wbuf[i]);
        }
}
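
/*
 * These submissions are asynchronous: journal_commit_transaction() later
 * drains t_locked_list, using wait_on_buffer() on each entry to pick up
 * the completions (and any IO errors) from these writes.
 */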

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                if (lock_need_resched(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
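        /* Flush out anything still queued in wbuf[]. */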
        journal_do_submit_data(wbuf, bufs);
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);
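
        /*
         * No new handle can join now (t_state is T_LOCKED) and t_updates
         * has drained to zero, so no running handle can touch this
         * transaction's buffers any more; commit proper can begin.
         */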

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
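
        /*
         * From this point a new running transaction may start.  Commit now
         * works mostly on its own transaction's lists, though e.g.
         * journal_unmap_buffer() can still file buffers onto t_forget,
         * which is why j_list_lock is still taken below.
         */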

        jbd_debug (3, "JBD: commit phase 2\n");

        /*
         * First, drop modified flag: all accesses to the buffers
         * will be tracked for a new transaction only -bzzz
         */
        spin_lock(&journal->j_list_lock);
        if (commit_transaction->t_buffers) {
                new_jh = jh = commit_transaction->t_buffers->b_tnext;
                do {
                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
                                        new_jh->b_modified == 0);
                        new_jh->b_modified = 0;
                        new_jh = new_jh->b_tnext;
                } while (new_jh != jh);
        }
        spin_unlock(&journal->j_list_lock);

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = 0;
        journal_submit_data_buffers(journal, commit_transaction);

        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
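
        /*
         * An IO error while draining t_locked_list means ordered-mode data
         * never reached disk, so abort the journal rather than let the
         * commit record claim the data is safe.
         */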

        if (err)
                journal_abort(journal, err);

        journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2 complete\n");

        /*
         * journal_submit_data_buffers() looped until t_sync_datalist was
         * empty, and the wait loop above unfiled every BJ_Locked buffer,
         * so the data list should be empty by now; check that it is.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        commit_transaction->t_state = T_COMMIT;

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
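
                /*
                 * On-disk layout of the descriptor block so far: a
                 * journal_header_t, then one journal_block_tag_t per
                 * journaled buffer, with the 16-byte journal UUID spliced
                 * in after the very first tag.
                 */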

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }
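
        /*
         * Each pass of the loop above retired one temporary BJ_IO buffer
         * together with its BJ_Shadow partner, so the two lists drain in
         * lockstep; the assert below checks that nothing was left behind.
         */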

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
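
        /*
         * j_checkpoint_transactions is a circular doubly-linked list
         * through t_cpnext/t_cpprev: the committed transaction was either
         * dropped outright (nothing left to checkpoint) or spliced in just
         * before the current list head.
         */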
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}