linux/fs/jbd/commit.c
/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}
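
/*
 * Note: the atomic_read(&bh->b_count) != 1 check above means the caller's
 * reference is the only one left, so nobody else can be using the buffer
 * while we attempt to free the page.  The unlocked checks are only hints;
 * try_to_free_buffers() re-verifies every buffer under the page lock, so a
 * stale answer here is harmless.
 */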

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
        if (buffer_freed(bh)) {
                WARN_ON_ONCE(buffer_dirty(bh));
                clear_buffer_freed(bh);
                clear_buffer_mapped(bh);
                clear_buffer_new(bh);
                clear_buffer_req(bh);
                bh->b_bdev = NULL;
                release_buffer_page(bh);
        } else
                put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
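
/*
 * Typical usage, as in the loops below: when inverted_lock() fails it has
 * already dropped j_list_lock, so the caller either takes the bh_state lock
 * the blocking way and re-acquires j_list_lock, or re-acquires j_list_lock
 * and restarts its walk of the list.
 */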

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        header = (journal_header_t *)(bh->b_data);
        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
        header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);

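        /*
         * With barriers enabled, WRITE_FLUSH_FUA issues a cache flush ahead
         * of the commit block, so all journal blocks written above are on
         * stable media before the commit record, and FUA forces the record
         * itself down as well.  Without barriers we fall back to a plain
         * synchronous write and rely on the ordering the device provides.
         */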
        if (journal->j_flags & JFS_BARRIER)
                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
        else
                ret = sync_dirty_buffer(bh);

        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);

        return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
                                   int write_op)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /*
                 * Here we write back pagecache data that may be mmaped. Since
                 * we cannot afford to clean the page and set PageWriteback
                 * here due to lock ordering (page lock ranks above transaction
                 * start), the data can change while IO is in flight. Tell the
                 * block layer it should bounce the bio pages if stable data
                 * during write is required.
                 *
                 * We use up our safety reference in submit_bh().
                 */
                _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
                                       transaction_t *commit_transaction,
                                       int write_op)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
        int err = 0;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (!trylock_buffer(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                     commit_transaction);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh) || bh2jh(bh) != jh
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        release_data_buffer(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                     commit_transaction);
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        release_data_buffer(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        trace_jbd_do_submit_data(journal, commit_transaction);
        journal_do_submit_data(wbuf, bufs, write_op);

        return err;
}
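
/*
 * In short, each buffer on t_sync_datalist is handled one of three ways:
 * a buffer that is still dirty is queued in wbuf[] and moved to BJ_Locked
 * for the wait pass; one that is clean but still under IO is just moved to
 * BJ_Locked; one whose IO has finished is unfiled and released immediately,
 * with -EIO noted if it is not uptodate.
 */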

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
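/*
 * After locking down the running transaction, the commit proceeds through
 * the numbered phases traced below: 1 (drop unused reserved buffers and
 * clean the checkpoint lists), 2 (submit and wait on ordered data),
 * 3 (write metadata and descriptor blocks to the log), 4 and 5 (wait on
 * the metadata, then the revoke/descriptor control buffers), 6 (write the
 * commit record), 7 (checkpoint bookkeeping for the forget list) and
 * 8 (mark the transaction T_FINISHED).
 */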
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned int blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        struct blk_plug plug;
        int write_op = WRITE;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so tail cannot change under us.
                 * We don't need any special data guarantees for writing sb
                 * since journal is empty and it is ok for write to be
                 * flushed only with transaction commit.
                 */
                journal_update_sb_log_tail(journal, journal->j_tail_sequence,
                                           journal->j_tail, WRITE_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;

        trace_jbd_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_RUNNING);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd_commit_locking(journal, commit_transaction);
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked
         * buffers in the next transaction which is going to be started.
         */
        journal_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);

        trace_jbd_commit_flushing(journal, commit_transaction);
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug (3, "JBD: commit phase 2\n");

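        /*
         * j_commit_waited is the most recent tid somebody (fsync, for
         * example) is sleeping on.  If it covers this transaction, use
         * WRITE_SYNC so the block layer treats the commit IO as synchronous.
         */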
        if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
                write_op = WRITE_SYNC;

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        blk_start_plug(&plug);
        err = journal_submit_data_buffers(journal, commit_transaction,
                                          write_op);
        blk_finish_plug(&plug);

        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        spin_lock(&journal->j_list_lock);
                }
                if (unlikely(!buffer_uptodate(bh))) {
                        if (!trylock_page(bh->b_page)) {
                                spin_unlock(&journal->j_list_lock);
                                lock_page(bh->b_page);
                                spin_lock(&journal->j_list_lock);
                        }
                        if (bh->b_page->mapping)
                                set_bit(AS_EIO, &bh->b_page->mapping->flags);

                        unlock_page(bh->b_page);
                        SetPageError(bh->b_page);
                        err = -EIO;
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
                    jh->b_transaction == commit_transaction &&
                    jh->b_jlist == BJ_Locked)
                        __journal_unfile_buffer(jh);
                jbd_unlock_bh_state(bh);
                release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (err) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: Detected IO errors while flushing file data "
                        "on %s\n", bdevname(journal->j_fs_dev, b));
                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
                        journal_abort(journal, err);
                err = 0;
        }

        blk_start_plug(&plug);

        journal_write_revoke_records(journal, commit_transaction, write_op);

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        trace_jbd_commit_logging(journal, commit_transaction);
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                get_bh(jh2bh(jh));

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_buffer_jwrite(jh2bh(jh));
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
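                /*
                 * journal_write_metadata_buffer() leaves jh itself on
                 * BJ_Shadow (anyone wanting write access to the block waits
                 * on BH_Unshadow until phase 4 below) and returns new_jh, a
                 * temporary copy filed on BJ_IO, which is what actually gets
                 * written to the log.
                 */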
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_buffer_jwrite(jh2bh(new_jh));
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

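                /*
                 * Bit 0 of the returned flags marks an "escaped" block: its
                 * first four bytes matched JFS_MAGIC_NUMBER and were zeroed
                 * in the log copy so recovery cannot mistake the block for a
                 * journal header.  The tag records this so the bytes can be
                 * restored on replay.
                 */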
                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
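
                /*
                 * On disk, a descriptor block is thus a journal_header_t
                 * followed by packed journal_block_tag_t entries.  Only the
                 * first tag carries the 16-byte filesystem UUID; later tags
                 * say JFS_FLAG_SAME_UUID instead.  The space_left test below
                 * keeps room for one more tag plus a UUID.
                 */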

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                /*
                                 * In data=journal mode, here we can end up
                                 * writing pagecache data that might be
                                 * mmapped. Since we can't afford to clean the
                                 * page and set PageWriteback (see the comment
                                 * near the other use of _submit_bh()), the
                                 * data can change while the write is in
                                 * flight.  Tell the block layer to bounce the
                                 * bio pages if stable pages are required.
                                 */
                                _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /*
                 * Wake up any transactions which were waiting for this
                 * IO to complete. The barrier must be here so that changes
                 * by journal_file_buffer() take effect before wake_up_bit()
                 * does the waitqueue check.
                 */
                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                journal_abort(journal, err);

        jbd_debug(3, "JBD: commit phase 6\n");

        /* All metadata is written, now write commit record and do cleanup */
        spin_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_RECORD;
        spin_unlock(&journal->j_state_lock);

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /*
                 * A buffer which has been freed while still being journaled by
                 * a previous transaction.
                 */
                if (buffer_freed(bh)) {
                        /*
                         * If the running transaction is the one containing
                         * "add to orphan" operation (b_next_transaction !=
                         * NULL), we have to wait for that transaction to
                         * commit before we can really get rid of the buffer.
                         * So just clear b_modified to avoid confusing the
                         * transaction credit accounting, and refile the
                         * buffer to BJ_Forget of the running transaction.
                         * If the just-committed transaction contains the
                         * "add to orphan" operation, we can completely
                         * invalidate the buffer now.  We are rather thorough
                         * here, since the buffer may still be accessible
                         * when blocksize < pagesize and it is attached to
                         * the last partial page.
                         */
                        jh->b_modified = 0;
                        if (!jh->b_next_transaction) {
                                clear_buffer_freed(bh);
                                clear_buffer_jbddirty(bh);
                                clear_buffer_mapped(bh);
                                clear_buffer_new(bh);
                                clear_buffer_req(bh);
                                bh->b_bdev = NULL;
                        }
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                __journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in commit time
         */
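        /*
         * That is, avg = (3 * commit_time + avg) / 4: an exponential moving
         * average giving the newest sample a weight of 3/4.
         */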
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time*3 +
                                journal->j_average_commit_time) / 4;
        else
                journal->j_average_commit_time = commit_time;

        spin_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
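                /*
                 * There are still buffers to checkpoint: keep the
                 * transaction on the circular j_checkpoint_transactions
                 * list.  An empty list is created afresh; otherwise the
                 * transaction is spliced in at the tail, leaving the head
                 * pointer at the oldest transaction.
                 */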
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        trace_jbd_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}