linux/fs/jbd2/commit.c
<<
>>
Prefs
   1/*
   2 * linux/fs/jbd2/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd2.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23
  24/*
  25 * Default IO end handler for temporary BJ_IO buffer_heads.
  26 */
  27static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  28{
  29        BUFFER_TRACE(bh, "");
  30        if (uptodate)
  31                set_buffer_uptodate(bh);
  32        else
  33                clear_buffer_uptodate(bh);
  34        unlock_buffer(bh);
  35}
  36
  37/*
  38 * When an ext3-ordered file is truncated, it is possible that many pages are
  39 * not sucessfully freed, because they are attached to a committing transaction.
  40 * After the transaction commits, these pages are left on the LRU, with no
  41 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  43 * the numbers in /proc/meminfo look odd.
  44 *
  45 * So here, we have a buffer which has just come off the forget list.  Look to
  46 * see if we can strip all buffers from the backing page.
  47 *
  48 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  49 * caller provided us with a ref against the buffer, and we drop that here.
  50 */
  51static void release_buffer_page(struct buffer_head *bh)
  52{
  53        struct page *page;
  54
  55        if (buffer_dirty(bh))
  56                goto nope;
  57        if (atomic_read(&bh->b_count) != 1)
  58                goto nope;
  59        page = bh->b_page;
  60        if (!page)
  61                goto nope;
  62        if (page->mapping)
  63                goto nope;
  64
  65        /* OK, it's a truncated page */
  66        if (TestSetPageLocked(page))
  67                goto nope;
  68
  69        page_cache_get(page);
  70        __brelse(bh);
  71        try_to_free_buffers(page);
  72        unlock_page(page);
  73        page_cache_release(page);
  74        return;
  75
  76nope:
  77        __brelse(bh);
  78}
  79
  80/*
  81 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  82 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  83 * return 0.  j_list_lock is dropped in this case.
  84 */
  85static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  86{
  87        if (!jbd_trylock_bh_state(bh)) {
  88                spin_unlock(&journal->j_list_lock);
  89                schedule();
  90                return 0;
  91        }
  92        return 1;
  93}
  94
  95/* Done it all: now write the commit record.  We should have
  96 * cleaned up our previous buffers by now, so if we are in abort
  97 * mode we can now just skip the rest of the journal write
  98 * entirely.
  99 *
 100 * Returns 1 if the journal needs to be aborted or 0 on success
 101 */
 102static int journal_write_commit_record(journal_t *journal,
 103                                        transaction_t *commit_transaction)
 104{
 105        struct journal_head *descriptor;
 106        struct buffer_head *bh;
 107        int i, ret;
 108        int barrier_done = 0;
 109
 110        if (is_journal_aborted(journal))
 111                return 0;
 112
 113        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 114        if (!descriptor)
 115                return 1;
 116
 117        bh = jh2bh(descriptor);
 118
 119        /* AKPM: buglet - add `i' to tmp! */
 120        for (i = 0; i < bh->b_size; i += 512) {
 121                journal_header_t *tmp = (journal_header_t*)bh->b_data;
 122                tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 123                tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 124                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 125        }
 126
 127        JBUFFER_TRACE(descriptor, "write commit block");
 128        set_buffer_dirty(bh);
 129        if (journal->j_flags & JBD2_BARRIER) {
 130                set_buffer_ordered(bh);
 131                barrier_done = 1;
 132        }
 133        ret = sync_dirty_buffer(bh);
 134        /* is it possible for another commit to fail at roughly
 135         * the same time as this one?  If so, we don't want to
 136         * trust the barrier flag in the super, but instead want
 137         * to remember if we sent a barrier request
 138         */
 139        if (ret == -EOPNOTSUPP && barrier_done) {
 140                char b[BDEVNAME_SIZE];
 141
 142                printk(KERN_WARNING
 143                        "JBD: barrier-based sync failed on %s - "
 144                        "disabling barriers\n",
 145                        bdevname(journal->j_dev, b));
 146                spin_lock(&journal->j_state_lock);
 147                journal->j_flags &= ~JBD2_BARRIER;
 148                spin_unlock(&journal->j_state_lock);
 149
 150                /* And try again, without the barrier */
 151                clear_buffer_ordered(bh);
 152                set_buffer_uptodate(bh);
 153                set_buffer_dirty(bh);
 154                ret = sync_dirty_buffer(bh);
 155        }
 156        put_bh(bh);             /* One for getblk() */
 157        jbd2_journal_put_journal_head(descriptor);
 158
 159        return (ret == -EIO);
 160}
 161
 162static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 163{
 164        int i;
 165
 166        for (i = 0; i < bufs; i++) {
 167                wbuf[i]->b_end_io = end_buffer_write_sync;
 168                /* We use-up our safety reference in submit_bh() */
 169                submit_bh(WRITE, wbuf[i]);
 170        }
 171}
 172
 173/*
 174 *  Submit all the data buffers to disk
 175 */
 176static void journal_submit_data_buffers(journal_t *journal,
 177                                transaction_t *commit_transaction)
 178{
 179        struct journal_head *jh;
 180        struct buffer_head *bh;
 181        int locked;
 182        int bufs = 0;
 183        struct buffer_head **wbuf = journal->j_wbuf;
 184
 185        /*
 186         * Whenever we unlock the journal and sleep, things can get added
 187         * onto ->t_sync_datalist, so we have to keep looping back to
 188         * write_out_data until we *know* that the list is empty.
 189         *
 190         * Cleanup any flushed data buffers from the data list.  Even in
 191         * abort mode, we want to flush this out as soon as possible.
 192         */
 193write_out_data:
 194        cond_resched();
 195        spin_lock(&journal->j_list_lock);
 196
 197        while (commit_transaction->t_sync_datalist) {
 198                jh = commit_transaction->t_sync_datalist;
 199                bh = jh2bh(jh);
 200                locked = 0;
 201
 202                /* Get reference just to make sure buffer does not disappear
 203                 * when we are forced to drop various locks */
 204                get_bh(bh);
 205                /* If the buffer is dirty, we need to submit IO and hence
 206                 * we need the buffer lock. We try to lock the buffer without
 207                 * blocking. If we fail, we need to drop j_list_lock and do
 208                 * blocking lock_buffer().
 209                 */
 210                if (buffer_dirty(bh)) {
 211                        if (test_set_buffer_locked(bh)) {
 212                                BUFFER_TRACE(bh, "needs blocking lock");
 213                                spin_unlock(&journal->j_list_lock);
 214                                /* Write out all data to prevent deadlocks */
 215                                journal_do_submit_data(wbuf, bufs);
 216                                bufs = 0;
 217                                lock_buffer(bh);
 218                                spin_lock(&journal->j_list_lock);
 219                        }
 220                        locked = 1;
 221                }
 222                /* We have to get bh_state lock. Again out of order, sigh. */
 223                if (!inverted_lock(journal, bh)) {
 224                        jbd_lock_bh_state(bh);
 225                        spin_lock(&journal->j_list_lock);
 226                }
 227                /* Someone already cleaned up the buffer? */
 228                if (!buffer_jbd(bh)
 229                        || jh->b_transaction != commit_transaction
 230                        || jh->b_jlist != BJ_SyncData) {
 231                        jbd_unlock_bh_state(bh);
 232                        if (locked)
 233                                unlock_buffer(bh);
 234                        BUFFER_TRACE(bh, "already cleaned up");
 235                        put_bh(bh);
 236                        continue;
 237                }
 238                if (locked && test_clear_buffer_dirty(bh)) {
 239                        BUFFER_TRACE(bh, "needs writeout, adding to array");
 240                        wbuf[bufs++] = bh;
 241                        __jbd2_journal_file_buffer(jh, commit_transaction,
 242                                                BJ_Locked);
 243                        jbd_unlock_bh_state(bh);
 244                        if (bufs == journal->j_wbufsize) {
 245                                spin_unlock(&journal->j_list_lock);
 246                                journal_do_submit_data(wbuf, bufs);
 247                                bufs = 0;
 248                                goto write_out_data;
 249                        }
 250                } else if (!locked && buffer_locked(bh)) {
 251                        __jbd2_journal_file_buffer(jh, commit_transaction,
 252                                                BJ_Locked);
 253                        jbd_unlock_bh_state(bh);
 254                        put_bh(bh);
 255                } else {
 256                        BUFFER_TRACE(bh, "writeout complete: unfile");
 257                        __jbd2_journal_unfile_buffer(jh);
 258                        jbd_unlock_bh_state(bh);
 259                        if (locked)
 260                                unlock_buffer(bh);
 261                        jbd2_journal_remove_journal_head(bh);
 262                        /* Once for our safety reference, once for
 263                         * jbd2_journal_remove_journal_head() */
 264                        put_bh(bh);
 265                        put_bh(bh);
 266                }
 267
 268                if (lock_need_resched(&journal->j_list_lock)) {
 269                        spin_unlock(&journal->j_list_lock);
 270                        goto write_out_data;
 271                }
 272        }
 273        spin_unlock(&journal->j_list_lock);
 274        journal_do_submit_data(wbuf, bufs);
 275}
 276
 277static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 278                                   unsigned long long block)
 279{
 280        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 281        if (tag_bytes > JBD2_TAG_SIZE32)
 282                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 283}
 284
 285/*
 286 * jbd2_journal_commit_transaction
 287 *
 288 * The primary function for committing a transaction to the log.  This
 289 * function is called by the journal thread to begin a complete commit.
 290 */
 291void jbd2_journal_commit_transaction(journal_t *journal)
 292{
 293        transaction_t *commit_transaction;
 294        struct journal_head *jh, *new_jh, *descriptor;
 295        struct buffer_head **wbuf = journal->j_wbuf;
 296        int bufs;
 297        int flags;
 298        int err;
 299        unsigned long long blocknr;
 300        char *tagp = NULL;
 301        journal_header_t *header;
 302        journal_block_tag_t *tag = NULL;
 303        int space_left = 0;
 304        int first_tag = 0;
 305        int tag_flag;
 306        int i;
 307        int tag_bytes = journal_tag_bytes(journal);
 308
 309        /*
 310         * First job: lock down the current transaction and wait for
 311         * all outstanding updates to complete.
 312         */
 313
 314#ifdef COMMIT_STATS
 315        spin_lock(&journal->j_list_lock);
 316        summarise_journal_usage(journal);
 317        spin_unlock(&journal->j_list_lock);
 318#endif
 319
 320        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 321        if (journal->j_flags & JBD2_FLUSHED) {
 322                jbd_debug(3, "super block updated\n");
 323                jbd2_journal_update_superblock(journal, 1);
 324        } else {
 325                jbd_debug(3, "superblock not updated\n");
 326        }
 327
 328        J_ASSERT(journal->j_running_transaction != NULL);
 329        J_ASSERT(journal->j_committing_transaction == NULL);
 330
 331        commit_transaction = journal->j_running_transaction;
 332        J_ASSERT(commit_transaction->t_state == T_RUNNING);
 333
 334        jbd_debug(1, "JBD: starting commit of transaction %d\n",
 335                        commit_transaction->t_tid);
 336
 337        spin_lock(&journal->j_state_lock);
 338        commit_transaction->t_state = T_LOCKED;
 339
 340        spin_lock(&commit_transaction->t_handle_lock);
 341        while (commit_transaction->t_updates) {
 342                DEFINE_WAIT(wait);
 343
 344                prepare_to_wait(&journal->j_wait_updates, &wait,
 345                                        TASK_UNINTERRUPTIBLE);
 346                if (commit_transaction->t_updates) {
 347                        spin_unlock(&commit_transaction->t_handle_lock);
 348                        spin_unlock(&journal->j_state_lock);
 349                        schedule();
 350                        spin_lock(&journal->j_state_lock);
 351                        spin_lock(&commit_transaction->t_handle_lock);
 352                }
 353                finish_wait(&journal->j_wait_updates, &wait);
 354        }
 355        spin_unlock(&commit_transaction->t_handle_lock);
 356
 357        J_ASSERT (commit_transaction->t_outstanding_credits <=
 358                        journal->j_max_transaction_buffers);
 359
 360        /*
 361         * First thing we are allowed to do is to discard any remaining
 362         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 363         * that there are no such buffers: if a large filesystem
 364         * operation like a truncate needs to split itself over multiple
 365         * transactions, then it may try to do a jbd2_journal_restart() while
 366         * there are still BJ_Reserved buffers outstanding.  These must
 367         * be released cleanly from the current transaction.
 368         *
 369         * In this case, the filesystem must still reserve write access
 370         * again before modifying the buffer in the new transaction, but
 371         * we do not require it to remember exactly which old buffers it
 372         * has reserved.  This is consistent with the existing behaviour
 373         * that multiple jbd2_journal_get_write_access() calls to the same
  374         * buffer are perfectly permissible.
 375         */
 376        while (commit_transaction->t_reserved_list) {
 377                jh = commit_transaction->t_reserved_list;
 378                JBUFFER_TRACE(jh, "reserved, unused: refile");
 379                /*
 380                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 381                 * leave undo-committed data.
 382                 */
 383                if (jh->b_committed_data) {
 384                        struct buffer_head *bh = jh2bh(jh);
 385
 386                        jbd_lock_bh_state(bh);
 387                        jbd2_free(jh->b_committed_data, bh->b_size);
 388                        jh->b_committed_data = NULL;
 389                        jbd_unlock_bh_state(bh);
 390                }
 391                jbd2_journal_refile_buffer(journal, jh);
 392        }
 393
 394        /*
 395         * Now try to drop any written-back buffers from the journal's
 396         * checkpoint lists.  We do this *before* commit because it potentially
 397         * frees some memory
 398         */
 399        spin_lock(&journal->j_list_lock);
 400        __jbd2_journal_clean_checkpoint_list(journal);
 401        spin_unlock(&journal->j_list_lock);
 402
 403        jbd_debug (3, "JBD: commit phase 1\n");
 404
 405        /*
 406         * Switch to a new revoke table.
 407         */
 408        jbd2_journal_switch_revoke_table(journal);
 409
 410        commit_transaction->t_state = T_FLUSH;
 411        journal->j_committing_transaction = commit_transaction;
 412        journal->j_running_transaction = NULL;
 413        commit_transaction->t_log_start = journal->j_head;
 414        wake_up(&journal->j_wait_transaction_locked);
 415        spin_unlock(&journal->j_state_lock);
 416
 417        jbd_debug (3, "JBD: commit phase 2\n");
 418
 419        /*
 420         * First, drop modified flag: all accesses to the buffers
  421         * will be tracked for a new transaction only -bzzz
 422         */
 423        spin_lock(&journal->j_list_lock);
 424        if (commit_transaction->t_buffers) {
 425                new_jh = jh = commit_transaction->t_buffers->b_tnext;
 426                do {
 427                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
 428                                        new_jh->b_modified == 0);
 429                        new_jh->b_modified = 0;
 430                        new_jh = new_jh->b_tnext;
 431                } while (new_jh != jh);
 432        }
 433        spin_unlock(&journal->j_list_lock);
 434
 435        /*
 436         * Now start flushing things to disk, in the order they appear
 437         * on the transaction lists.  Data blocks go first.
 438         */
 439        err = 0;
 440        journal_submit_data_buffers(journal, commit_transaction);
 441
 442        /*
 443         * Wait for all previously submitted IO to complete.
 444         */
 445        spin_lock(&journal->j_list_lock);
 446        while (commit_transaction->t_locked_list) {
 447                struct buffer_head *bh;
 448
 449                jh = commit_transaction->t_locked_list->b_tprev;
 450                bh = jh2bh(jh);
 451                get_bh(bh);
 452                if (buffer_locked(bh)) {
 453                        spin_unlock(&journal->j_list_lock);
 454                        wait_on_buffer(bh);
 455                        if (unlikely(!buffer_uptodate(bh)))
 456                                err = -EIO;
 457                        spin_lock(&journal->j_list_lock);
 458                }
 459                if (!inverted_lock(journal, bh)) {
 460                        put_bh(bh);
 461                        spin_lock(&journal->j_list_lock);
 462                        continue;
 463                }
 464                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 465                        __jbd2_journal_unfile_buffer(jh);
 466                        jbd_unlock_bh_state(bh);
 467                        jbd2_journal_remove_journal_head(bh);
 468                        put_bh(bh);
 469                } else {
 470                        jbd_unlock_bh_state(bh);
 471                }
 472                put_bh(bh);
 473                cond_resched_lock(&journal->j_list_lock);
 474        }
 475        spin_unlock(&journal->j_list_lock);
 476
 477        if (err)
 478                jbd2_journal_abort(journal, err);
 479
 480        jbd2_journal_write_revoke_records(journal, commit_transaction);
 481
 482        jbd_debug(3, "JBD: commit phase 2\n");
 483
 484        /*
 485         * If we found any dirty or locked buffers, then we should have
 486         * looped back up to the write_out_data label.  If there weren't
 487         * any then journal_clean_data_list should have wiped the list
 488         * clean by now, so check that it is in fact empty.
 489         */
 490        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 491
 492        jbd_debug (3, "JBD: commit phase 3\n");
 493
 494        /*
 495         * Way to go: we have now written out all of the data for a
 496         * transaction!  Now comes the tricky part: we need to write out
 497         * metadata.  Loop over the transaction's entire buffer list:
 498         */
 499        commit_transaction->t_state = T_COMMIT;
 500
 501        descriptor = NULL;
 502        bufs = 0;
 503        while (commit_transaction->t_buffers) {
 504
 505                /* Find the next buffer to be journaled... */
 506
 507                jh = commit_transaction->t_buffers;
 508
 509                /* If we're in abort mode, we just un-journal the buffer and
 510                   release it for background writing. */
 511
 512                if (is_journal_aborted(journal)) {
 513                        JBUFFER_TRACE(jh, "journal is aborting: refile");
 514                        jbd2_journal_refile_buffer(journal, jh);
 515                        /* If that was the last one, we need to clean up
 516                         * any descriptor buffers which may have been
 517                         * already allocated, even if we are now
 518                         * aborting. */
 519                        if (!commit_transaction->t_buffers)
 520                                goto start_journal_io;
 521                        continue;
 522                }
 523
 524                /* Make sure we have a descriptor block in which to
 525                   record the metadata buffer. */
 526
 527                if (!descriptor) {
 528                        struct buffer_head *bh;
 529
 530                        J_ASSERT (bufs == 0);
 531
 532                        jbd_debug(4, "JBD: get descriptor\n");
 533
 534                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 535                        if (!descriptor) {
 536                                jbd2_journal_abort(journal, -EIO);
 537                                continue;
 538                        }
 539
 540                        bh = jh2bh(descriptor);
 541                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 542                                (unsigned long long)bh->b_blocknr, bh->b_data);
 543                        header = (journal_header_t *)&bh->b_data[0];
 544                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 545                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 546                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 547
 548                        tagp = &bh->b_data[sizeof(journal_header_t)];
 549                        space_left = bh->b_size - sizeof(journal_header_t);
 550                        first_tag = 1;
 551                        set_buffer_jwrite(bh);
 552                        set_buffer_dirty(bh);
 553                        wbuf[bufs++] = bh;
 554
 555                        /* Record it so that we can wait for IO
 556                           completion later */
 557                        BUFFER_TRACE(bh, "ph3: file as descriptor");
 558                        jbd2_journal_file_buffer(descriptor, commit_transaction,
 559                                        BJ_LogCtl);
 560                }
 561
 562                /* Where is the buffer to be written? */
 563
 564                err = jbd2_journal_next_log_block(journal, &blocknr);
 565                /* If the block mapping failed, just abandon the buffer
 566                   and repeat this loop: we'll fall into the
 567                   refile-on-abort condition above. */
 568                if (err) {
 569                        jbd2_journal_abort(journal, err);
 570                        continue;
 571                }
 572
 573                /*
 574                 * start_this_handle() uses t_outstanding_credits to determine
 575                 * the free space in the log, but this counter is changed
 576                 * by jbd2_journal_next_log_block() also.
 577                 */
 578                commit_transaction->t_outstanding_credits--;
 579
 580                /* Bump b_count to prevent truncate from stumbling over
 581                   the shadowed buffer!  @@@ This can go if we ever get
 582                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 583                atomic_inc(&jh2bh(jh)->b_count);
 584
 585                /* Make a temporary IO buffer with which to write it out
 586                   (this will requeue both the metadata buffer and the
 587                   temporary IO buffer). new_bh goes on BJ_IO*/
 588
 589                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 590                /*
 591                 * akpm: jbd2_journal_write_metadata_buffer() sets
 592                 * new_bh->b_transaction to commit_transaction.
 593                 * We need to clean this up before we release new_bh
 594                 * (which is of type BJ_IO)
 595                 */
 596                JBUFFER_TRACE(jh, "ph3: write metadata");
 597                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 598                                                      jh, &new_jh, blocknr);
 599                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 600                wbuf[bufs++] = jh2bh(new_jh);
 601
 602                /* Record the new block's tag in the current descriptor
 603                   buffer */
 604
 605                tag_flag = 0;
 606                if (flags & 1)
 607                        tag_flag |= JBD2_FLAG_ESCAPE;
 608                if (!first_tag)
 609                        tag_flag |= JBD2_FLAG_SAME_UUID;
 610
 611                tag = (journal_block_tag_t *) tagp;
 612                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 613                tag->t_flags = cpu_to_be32(tag_flag);
 614                tagp += tag_bytes;
 615                space_left -= tag_bytes;
 616
 617                if (first_tag) {
 618                        memcpy (tagp, journal->j_uuid, 16);
 619                        tagp += 16;
 620                        space_left -= 16;
 621                        first_tag = 0;
 622                }
 623
 624                /* If there's no more to do, or if the descriptor is full,
 625                   let the IO rip! */
 626
 627                if (bufs == journal->j_wbufsize ||
 628                    commit_transaction->t_buffers == NULL ||
 629                    space_left < tag_bytes + 16) {
 630
 631                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 632
 633                        /* Write an end-of-descriptor marker before
 634                           submitting the IOs.  "tag" still points to
 635                           the last tag we set up. */
 636
 637                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 638
 639start_journal_io:
 640                        for (i = 0; i < bufs; i++) {
 641                                struct buffer_head *bh = wbuf[i];
 642                                lock_buffer(bh);
 643                                clear_buffer_dirty(bh);
 644                                set_buffer_uptodate(bh);
 645                                bh->b_end_io = journal_end_buffer_io_sync;
 646                                submit_bh(WRITE, bh);
 647                        }
 648                        cond_resched();
 649
 650                        /* Force a new descriptor to be generated next
 651                           time round the loop. */
 652                        descriptor = NULL;
 653                        bufs = 0;
 654                }
 655        }
 656
 657        /* Lo and behold: we have just managed to send a transaction to
 658           the log.  Before we can commit it, wait for the IO so far to
 659           complete.  Control buffers being written are on the
 660           transaction's t_log_list queue, and metadata buffers are on
 661           the t_iobuf_list queue.
 662
 663           Wait for the buffers in reverse order.  That way we are
 664           less likely to be woken up until all IOs have completed, and
 665           so we incur less scheduling load.
 666        */
 667
 668        jbd_debug(3, "JBD: commit phase 4\n");
 669
 670        /*
 671         * akpm: these are BJ_IO, and j_list_lock is not needed.
 672         * See __journal_try_to_free_buffer.
 673         */
 674wait_for_iobuf:
 675        while (commit_transaction->t_iobuf_list != NULL) {
 676                struct buffer_head *bh;
 677
 678                jh = commit_transaction->t_iobuf_list->b_tprev;
 679                bh = jh2bh(jh);
 680                if (buffer_locked(bh)) {
 681                        wait_on_buffer(bh);
 682                        goto wait_for_iobuf;
 683                }
 684                if (cond_resched())
 685                        goto wait_for_iobuf;
 686
 687                if (unlikely(!buffer_uptodate(bh)))
 688                        err = -EIO;
 689
 690                clear_buffer_jwrite(bh);
 691
 692                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 693                jbd2_journal_unfile_buffer(journal, jh);
 694
 695                /*
 696                 * ->t_iobuf_list should contain only dummy buffer_heads
 697                 * which were created by jbd2_journal_write_metadata_buffer().
 698                 */
 699                BUFFER_TRACE(bh, "dumping temporary bh");
 700                jbd2_journal_put_journal_head(jh);
 701                __brelse(bh);
 702                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 703                free_buffer_head(bh);
 704
 705                /* We also have to unlock and free the corresponding
 706                   shadowed buffer */
 707                jh = commit_transaction->t_shadow_list->b_tprev;
 708                bh = jh2bh(jh);
 709                clear_bit(BH_JWrite, &bh->b_state);
 710                J_ASSERT_BH(bh, buffer_jbddirty(bh));
 711
 712                /* The metadata is now released for reuse, but we need
 713                   to remember it against this transaction so that when
 714                   we finally commit, we can do any checkpointing
 715                   required. */
 716                JBUFFER_TRACE(jh, "file as BJ_Forget");
 717                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 718                /* Wake up any transactions which were waiting for this
 719                   IO to complete */
 720                wake_up_bit(&bh->b_state, BH_Unshadow);
 721                JBUFFER_TRACE(jh, "brelse shadowed buffer");
 722                __brelse(bh);
 723        }
 724
 725        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 726
 727        jbd_debug(3, "JBD: commit phase 5\n");
 728
 729        /* Here we wait for the revoke record and descriptor record buffers */
 730 wait_for_ctlbuf:
 731        while (commit_transaction->t_log_list != NULL) {
 732                struct buffer_head *bh;
 733
 734                jh = commit_transaction->t_log_list->b_tprev;
 735                bh = jh2bh(jh);
 736                if (buffer_locked(bh)) {
 737                        wait_on_buffer(bh);
 738                        goto wait_for_ctlbuf;
 739                }
 740                if (cond_resched())
 741                        goto wait_for_ctlbuf;
 742
 743                if (unlikely(!buffer_uptodate(bh)))
 744                        err = -EIO;
 745
 746                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 747                clear_buffer_jwrite(bh);
 748                jbd2_journal_unfile_buffer(journal, jh);
 749                jbd2_journal_put_journal_head(jh);
 750                __brelse(bh);           /* One for getblk */
 751                /* AKPM: bforget here */
 752        }
 753
 754        jbd_debug(3, "JBD: commit phase 6\n");
 755
 756        if (journal_write_commit_record(journal, commit_transaction))
 757                err = -EIO;
 758
 759        if (err)
 760                jbd2_journal_abort(journal, err);
 761
 762        /* End of a transaction!  Finally, we can do checkpoint
 763           processing: any buffer committed as a result of this
 764           transaction can be removed from any checkpoint list it was on
 765           before. */
 766
 767        jbd_debug(3, "JBD: commit phase 7\n");
 768
 769        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 770        J_ASSERT(commit_transaction->t_buffers == NULL);
 771        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 772        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 773        J_ASSERT(commit_transaction->t_shadow_list == NULL);
 774        J_ASSERT(commit_transaction->t_log_list == NULL);
 775
 776restart_loop:
 777        /*
 778         * As there are other places (journal_unmap_buffer()) adding buffers
 779         * to this list we have to be careful and hold the j_list_lock.
 780         */
 781        spin_lock(&journal->j_list_lock);
 782        while (commit_transaction->t_forget) {
 783                transaction_t *cp_transaction;
 784                struct buffer_head *bh;
 785
 786                jh = commit_transaction->t_forget;
 787                spin_unlock(&journal->j_list_lock);
 788                bh = jh2bh(jh);
 789                jbd_lock_bh_state(bh);
 790                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 791                        jh->b_transaction == journal->j_running_transaction);
 792
 793                /*
 794                 * If there is undo-protected committed data against
 795                 * this buffer, then we can remove it now.  If it is a
 796                 * buffer needing such protection, the old frozen_data
 797                 * field now points to a committed version of the
 798                 * buffer, so rotate that field to the new committed
 799                 * data.
 800                 *
 801                 * Otherwise, we can just throw away the frozen data now.
 802                 */
 803                if (jh->b_committed_data) {
 804                        jbd2_free(jh->b_committed_data, bh->b_size);
 805                        jh->b_committed_data = NULL;
 806                        if (jh->b_frozen_data) {
 807                                jh->b_committed_data = jh->b_frozen_data;
 808                                jh->b_frozen_data = NULL;
 809                        }
 810                } else if (jh->b_frozen_data) {
 811                        jbd2_free(jh->b_frozen_data, bh->b_size);
 812                        jh->b_frozen_data = NULL;
 813                }
 814
 815                spin_lock(&journal->j_list_lock);
 816                cp_transaction = jh->b_cp_transaction;
 817                if (cp_transaction) {
 818                        JBUFFER_TRACE(jh, "remove from old cp transaction");
 819                        __jbd2_journal_remove_checkpoint(jh);
 820                }
 821
 822                /* Only re-checkpoint the buffer_head if it is marked
 823                 * dirty.  If the buffer was added to the BJ_Forget list
 824                 * by jbd2_journal_forget, it may no longer be dirty and
 825                 * there's no point in keeping a checkpoint record for
 826                 * it. */
 827
 828                /* A buffer which has been freed while still being
 829                 * journaled by a previous transaction may end up still
 830                 * being dirty here, but we want to avoid writing back
 831                 * that buffer in the future now that the last use has
 832                 * been committed.  That's not only a performance gain,
 833                 * it also stops aliasing problems if the buffer is left
 834                 * behind for writeback and gets reallocated for another
 835                 * use in a different page. */
 836                if (buffer_freed(bh)) {
 837                        clear_buffer_freed(bh);
 838                        clear_buffer_jbddirty(bh);
 839                }
 840
 841                if (buffer_jbddirty(bh)) {
 842                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
 843                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 844                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 845                        __jbd2_journal_refile_buffer(jh);
 846                        jbd_unlock_bh_state(bh);
 847                } else {
 848                        J_ASSERT_BH(bh, !buffer_dirty(bh));
 849                        /* The buffer on BJ_Forget list and not jbddirty means
 850                         * it has been freed by this transaction and hence it
 851                         * could not have been reallocated until this
 852                         * transaction has committed. *BUT* it could be
 853                         * reallocated once we have written all the data to
 854                         * disk and before we process the buffer on BJ_Forget
 855                         * list. */
 856                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 857                        __jbd2_journal_refile_buffer(jh);
 858                        if (!jh->b_transaction) {
 859                                jbd_unlock_bh_state(bh);
 860                                 /* needs a brelse */
 861                                jbd2_journal_remove_journal_head(bh);
 862                                release_buffer_page(bh);
 863                        } else
 864                                jbd_unlock_bh_state(bh);
 865                }
 866                cond_resched_lock(&journal->j_list_lock);
 867        }
 868        spin_unlock(&journal->j_list_lock);
 869        /*
 870         * This is a bit sleazy.  We borrow j_list_lock to protect
 871         * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
 872         * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
 873         * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint.
 874         */
 875        spin_lock(&journal->j_state_lock);
 876        spin_lock(&journal->j_list_lock);
 877        /*
 878         * Now recheck if some buffers did not get attached to the transaction
 879         * while the lock was dropped...
 880         */
 881        if (commit_transaction->t_forget) {
 882                spin_unlock(&journal->j_list_lock);
 883                spin_unlock(&journal->j_state_lock);
 884                goto restart_loop;
 885        }
 886
 887        /* Done with this transaction! */
 888
 889        jbd_debug(3, "JBD: commit phase 8\n");
 890
 891        J_ASSERT(commit_transaction->t_state == T_COMMIT);
 892
 893        commit_transaction->t_state = T_FINISHED;
 894        J_ASSERT(commit_transaction == journal->j_committing_transaction);
 895        journal->j_commit_sequence = commit_transaction->t_tid;
 896        journal->j_committing_transaction = NULL;
 897        spin_unlock(&journal->j_state_lock);
 898
 899        if (commit_transaction->t_checkpoint_list == NULL &&
 900            commit_transaction->t_checkpoint_io_list == NULL) {
 901                __jbd2_journal_drop_transaction(journal, commit_transaction);
 902        } else {
 903                if (journal->j_checkpoint_transactions == NULL) {
 904                        journal->j_checkpoint_transactions = commit_transaction;
 905                        commit_transaction->t_cpnext = commit_transaction;
 906                        commit_transaction->t_cpprev = commit_transaction;
 907                } else {
 908                        commit_transaction->t_cpnext =
 909                                journal->j_checkpoint_transactions;
 910                        commit_transaction->t_cpprev =
 911                                commit_transaction->t_cpnext->t_cpprev;
 912                        commit_transaction->t_cpnext->t_cpprev =
 913                                commit_transaction;
 914                        commit_transaction->t_cpprev->t_cpnext =
 915                                commit_transaction;
 916                }
 917        }
 918        spin_unlock(&journal->j_list_lock);
 919
 920        jbd_debug(1, "JBD: commit %d complete, head %d\n",
 921                  journal->j_commit_sequence, journal->j_tail_sequence);
 922
 923        wake_up(&journal->j_wait_done_commit);
 924}
 925