linux/fs/jbd/commit.c
<<
>>
Prefs
   1/*
   2 * linux/fs/jbd/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23#include <linux/bio.h>
  24
  25/*
  26 * Default IO end handler for temporary BJ_IO buffer_heads.
  27 */
  28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  29{
  30        BUFFER_TRACE(bh, "");
  31        if (uptodate)
  32                set_buffer_uptodate(bh);
  33        else
  34                clear_buffer_uptodate(bh);
  35        unlock_buffer(bh);
  36}
  37
  38/*
  39 * When an ext3-ordered file is truncated, it is possible that many pages are
  40 * not successfully freed, because they are attached to a committing transaction.
  41 * After the transaction commits, these pages are left on the LRU, with no
  42 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  44 * the numbers in /proc/meminfo look odd.
  45 *
  46 * So here, we have a buffer which has just come off the forget list.  Look to
  47 * see if we can strip all buffers from the backing page.
  48 *
  49 * Called under journal->j_list_lock.  The caller provided us with a ref
  50 * against the buffer, and we drop that here.
  51 */
  52static void release_buffer_page(struct buffer_head *bh)
  53{
  54        struct page *page;
  55
  56        if (buffer_dirty(bh))
  57                goto nope;
  58        if (atomic_read(&bh->b_count) != 1)
  59                goto nope;
  60        page = bh->b_page;
  61        if (!page)
  62                goto nope;
  63        if (page->mapping)
  64                goto nope;
  65
  66        /* OK, it's a truncated page */
  67        if (!trylock_page(page))
  68                goto nope;
  69
  70        page_cache_get(page);
  71        __brelse(bh);
  72        try_to_free_buffers(page);
  73        unlock_page(page);
  74        page_cache_release(page);
  75        return;
  76
  77nope:
  78        __brelse(bh);
  79}
  80
  81/*
  82 * Decrement reference counter for data buffer. If it has been marked
  83 * 'BH_Freed', release it and the page to which it belongs if possible.
  84 */
  85static void release_data_buffer(struct buffer_head *bh)
  86{
  87        if (buffer_freed(bh)) {
  88                clear_buffer_freed(bh);
  89                release_buffer_page(bh);
  90        } else
  91                put_bh(bh);
  92}
  93
  94/*
  95 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  96 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  97 * return 0.  j_list_lock is dropped in this case.
  98 */
  99static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 100{
 101        if (!jbd_trylock_bh_state(bh)) {
 102                spin_unlock(&journal->j_list_lock);
 103                schedule();
 104                return 0;
 105        }
 106        return 1;
 107}
 108
 109/* Done it all: now write the commit record.  We should have
 110 * cleaned up our previous buffers by now, so if we are in abort
 111 * mode we can now just skip the rest of the journal write
 112 * entirely.
 113 *
 114 * Returns 1 if the journal needs to be aborted or 0 on success
 115 */
 116static int journal_write_commit_record(journal_t *journal,
 117                                        transaction_t *commit_transaction)
 118{
 119        struct journal_head *descriptor;
 120        struct buffer_head *bh;
 121        journal_header_t *header;
 122        int ret;
 123        int barrier_done = 0;
 124
 125        if (is_journal_aborted(journal))
 126                return 0;
 127
 128        descriptor = journal_get_descriptor_buffer(journal);
 129        if (!descriptor)
 130                return 1;
 131
 132        bh = jh2bh(descriptor);
 133
 134        header = (journal_header_t *)(bh->b_data);
 135        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
 136        header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
 137        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 138
 139        JBUFFER_TRACE(descriptor, "write commit block");
 140        set_buffer_dirty(bh);
 141        if (journal->j_flags & JFS_BARRIER) {
 142                set_buffer_ordered(bh);
 143                barrier_done = 1;
 144        }
 145        ret = sync_dirty_buffer(bh);
 146        if (barrier_done)
 147                clear_buffer_ordered(bh);
 148        /* is it possible for another commit to fail at roughly
 149         * the same time as this one?  If so, we don't want to
 150         * trust the barrier flag in the super, but instead want
 151         * to remember if we sent a barrier request
 152         */
 153        if (ret == -EOPNOTSUPP && barrier_done) {
 154                char b[BDEVNAME_SIZE];
 155
 156                printk(KERN_WARNING
 157                        "JBD: barrier-based sync failed on %s - "
 158                        "disabling barriers\n",
 159                        bdevname(journal->j_dev, b));
 160                spin_lock(&journal->j_state_lock);
 161                journal->j_flags &= ~JFS_BARRIER;
 162                spin_unlock(&journal->j_state_lock);
 163
 164                /* And try again, without the barrier */
 165                set_buffer_uptodate(bh);
 166                set_buffer_dirty(bh);
 167                ret = sync_dirty_buffer(bh);
 168        }
 169        put_bh(bh);             /* One for getblk() */
 170        journal_put_journal_head(descriptor);
 171
 172        return (ret == -EIO);
 173}
 174
 175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
 176                                   int write_op)
 177{
 178        int i;
 179
 180        for (i = 0; i < bufs; i++) {
 181                wbuf[i]->b_end_io = end_buffer_write_sync;
 182                /* We use-up our safety reference in submit_bh() */
 183                submit_bh(write_op, wbuf[i]);
 184        }
 185}
 186
 187/*
 188 *  Submit all the data buffers to disk
 189 */
 190static int journal_submit_data_buffers(journal_t *journal,
 191                                       transaction_t *commit_transaction,
 192                                       int write_op)
 193{
 194        struct journal_head *jh;
 195        struct buffer_head *bh;
 196        int locked;
 197        int bufs = 0;
 198        struct buffer_head **wbuf = journal->j_wbuf;
 199        int err = 0;
 200
 201        /*
 202         * Whenever we unlock the journal and sleep, things can get added
 203         * onto ->t_sync_datalist, so we have to keep looping back to
 204         * write_out_data until we *know* that the list is empty.
 205         *
 206         * Cleanup any flushed data buffers from the data list.  Even in
 207         * abort mode, we want to flush this out as soon as possible.
 208         */
 209write_out_data:
 210        cond_resched();
 211        spin_lock(&journal->j_list_lock);
 212
 213        while (commit_transaction->t_sync_datalist) {
 214                jh = commit_transaction->t_sync_datalist;
 215                bh = jh2bh(jh);
 216                locked = 0;
 217
 218                /* Get reference just to make sure buffer does not disappear
 219                 * when we are forced to drop various locks */
 220                get_bh(bh);
 221                /* If the buffer is dirty, we need to submit IO and hence
 222                 * we need the buffer lock. We try to lock the buffer without
 223                 * blocking. If we fail, we need to drop j_list_lock and do
 224                 * blocking lock_buffer().
 225                 */
 226                if (buffer_dirty(bh)) {
 227                        if (!trylock_buffer(bh)) {
 228                                BUFFER_TRACE(bh, "needs blocking lock");
 229                                spin_unlock(&journal->j_list_lock);
 230                                /* Write out all data to prevent deadlocks */
 231                                journal_do_submit_data(wbuf, bufs, write_op);
 232                                bufs = 0;
 233                                lock_buffer(bh);
 234                                spin_lock(&journal->j_list_lock);
 235                        }
 236                        locked = 1;
 237                }
 238                /* We have to get bh_state lock. Again out of order, sigh. */
 239                if (!inverted_lock(journal, bh)) {
 240                        jbd_lock_bh_state(bh);
 241                        spin_lock(&journal->j_list_lock);
 242                }
 243                /* Someone already cleaned up the buffer? */
 244                if (!buffer_jbd(bh) || bh2jh(bh) != jh
 245                        || jh->b_transaction != commit_transaction
 246                        || jh->b_jlist != BJ_SyncData) {
 247                        jbd_unlock_bh_state(bh);
 248                        if (locked)
 249                                unlock_buffer(bh);
 250                        BUFFER_TRACE(bh, "already cleaned up");
 251                        release_data_buffer(bh);
 252                        continue;
 253                }
 254                if (locked && test_clear_buffer_dirty(bh)) {
 255                        BUFFER_TRACE(bh, "needs writeout, adding to array");
 256                        wbuf[bufs++] = bh;
 257                        __journal_file_buffer(jh, commit_transaction,
 258                                                BJ_Locked);
 259                        jbd_unlock_bh_state(bh);
 260                        if (bufs == journal->j_wbufsize) {
 261                                spin_unlock(&journal->j_list_lock);
 262                                journal_do_submit_data(wbuf, bufs, write_op);
 263                                bufs = 0;
 264                                goto write_out_data;
 265                        }
 266                } else if (!locked && buffer_locked(bh)) {
 267                        __journal_file_buffer(jh, commit_transaction,
 268                                                BJ_Locked);
 269                        jbd_unlock_bh_state(bh);
 270                        put_bh(bh);
 271                } else {
 272                        BUFFER_TRACE(bh, "writeout complete: unfile");
 273                        if (unlikely(!buffer_uptodate(bh)))
 274                                err = -EIO;
 275                        __journal_unfile_buffer(jh);
 276                        jbd_unlock_bh_state(bh);
 277                        if (locked)
 278                                unlock_buffer(bh);
 279                        journal_remove_journal_head(bh);
 280                        /* One for our safety reference, other for
 281                         * journal_remove_journal_head() */
 282                        put_bh(bh);
 283                        release_data_buffer(bh);
 284                }
 285
 286                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 287                        spin_unlock(&journal->j_list_lock);
 288                        goto write_out_data;
 289                }
 290        }
 291        spin_unlock(&journal->j_list_lock);
 292        journal_do_submit_data(wbuf, bufs, write_op);
 293
 294        return err;
 295}
 296
 297/*
 298 * journal_commit_transaction
 299 *
 300 * The primary function for committing a transaction to the log.  This
 301 * function is called by the journal thread to begin a complete commit.
 302 */
 303void journal_commit_transaction(journal_t *journal)
 304{
 305        transaction_t *commit_transaction;
 306        struct journal_head *jh, *new_jh, *descriptor;
 307        struct buffer_head **wbuf = journal->j_wbuf;
 308        int bufs;
 309        int flags;
 310        int err;
 311        unsigned int blocknr;
 312        ktime_t start_time;
 313        u64 commit_time;
 314        char *tagp = NULL;
 315        journal_header_t *header;
 316        journal_block_tag_t *tag = NULL;
 317        int space_left = 0;
 318        int first_tag = 0;
 319        int tag_flag;
 320        int i;
 321        int write_op = WRITE;
 322
 323        /*
 324         * First job: lock down the current transaction and wait for
 325         * all outstanding updates to complete.
 326         */
 327
 328#ifdef COMMIT_STATS
 329        spin_lock(&journal->j_list_lock);
 330        summarise_journal_usage(journal);
 331        spin_unlock(&journal->j_list_lock);
 332#endif
 333
 334        /* Do we need to erase the effects of a prior journal_flush? */
 335        if (journal->j_flags & JFS_FLUSHED) {
 336                jbd_debug(3, "super block updated\n");
 337                journal_update_superblock(journal, 1);
 338        } else {
 339                jbd_debug(3, "superblock not updated\n");
 340        }
 341
 342        J_ASSERT(journal->j_running_transaction != NULL);
 343        J_ASSERT(journal->j_committing_transaction == NULL);
 344
 345        commit_transaction = journal->j_running_transaction;
 346        J_ASSERT(commit_transaction->t_state == T_RUNNING);
 347
 348        jbd_debug(1, "JBD: starting commit of transaction %d\n",
 349                        commit_transaction->t_tid);
 350
 351        spin_lock(&journal->j_state_lock);
 352        commit_transaction->t_state = T_LOCKED;
 353
 354        /*
 355         * Use plugged writes here, since we want to submit several before
 356         * we unplug the device. We don't do explicit unplugging in here,
 357         * instead we rely on sync_buffer() doing the unplug for us.
 358         */
 359        if (commit_transaction->t_synchronous_commit)
 360                write_op = WRITE_SYNC_PLUG;
 361        spin_lock(&commit_transaction->t_handle_lock);
 362        while (commit_transaction->t_updates) {
 363                DEFINE_WAIT(wait);
 364
 365                prepare_to_wait(&journal->j_wait_updates, &wait,
 366                                        TASK_UNINTERRUPTIBLE);
 367                if (commit_transaction->t_updates) {
 368                        spin_unlock(&commit_transaction->t_handle_lock);
 369                        spin_unlock(&journal->j_state_lock);
 370                        schedule();
 371                        spin_lock(&journal->j_state_lock);
 372                        spin_lock(&commit_transaction->t_handle_lock);
 373                }
 374                finish_wait(&journal->j_wait_updates, &wait);
 375        }
 376        spin_unlock(&commit_transaction->t_handle_lock);
 377
 378        J_ASSERT (commit_transaction->t_outstanding_credits <=
 379                        journal->j_max_transaction_buffers);
 380
 381        /*
 382         * First thing we are allowed to do is to discard any remaining
 383         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 384         * that there are no such buffers: if a large filesystem
 385         * operation like a truncate needs to split itself over multiple
 386         * transactions, then it may try to do a journal_restart() while
 387         * there are still BJ_Reserved buffers outstanding.  These must
 388         * be released cleanly from the current transaction.
 389         *
 390         * In this case, the filesystem must still reserve write access
 391         * again before modifying the buffer in the new transaction, but
 392         * we do not require it to remember exactly which old buffers it
 393         * has reserved.  This is consistent with the existing behaviour
 394         * that multiple journal_get_write_access() calls to the same
  395         * buffer are perfectly permissible.
 396         */
 397        while (commit_transaction->t_reserved_list) {
 398                jh = commit_transaction->t_reserved_list;
 399                JBUFFER_TRACE(jh, "reserved, unused: refile");
 400                /*
 401                 * A journal_get_undo_access()+journal_release_buffer() may
 402                 * leave undo-committed data.
 403                 */
 404                if (jh->b_committed_data) {
 405                        struct buffer_head *bh = jh2bh(jh);
 406
 407                        jbd_lock_bh_state(bh);
 408                        jbd_free(jh->b_committed_data, bh->b_size);
 409                        jh->b_committed_data = NULL;
 410                        jbd_unlock_bh_state(bh);
 411                }
 412                journal_refile_buffer(journal, jh);
 413        }
 414
 415        /*
 416         * Now try to drop any written-back buffers from the journal's
 417         * checkpoint lists.  We do this *before* commit because it potentially
 418         * frees some memory
 419         */
 420        spin_lock(&journal->j_list_lock);
 421        __journal_clean_checkpoint_list(journal);
 422        spin_unlock(&journal->j_list_lock);
 423
 424        jbd_debug (3, "JBD: commit phase 1\n");
 425
 426        /*
 427         * Switch to a new revoke table.
 428         */
 429        journal_switch_revoke_table(journal);
 430
 431        commit_transaction->t_state = T_FLUSH;
 432        journal->j_committing_transaction = commit_transaction;
 433        journal->j_running_transaction = NULL;
 434        start_time = ktime_get();
 435        commit_transaction->t_log_start = journal->j_head;
 436        wake_up(&journal->j_wait_transaction_locked);
 437        spin_unlock(&journal->j_state_lock);
 438
 439        jbd_debug (3, "JBD: commit phase 2\n");
 440
 441        /*
 442         * Now start flushing things to disk, in the order they appear
 443         * on the transaction lists.  Data blocks go first.
 444         */
 445        err = journal_submit_data_buffers(journal, commit_transaction,
 446                                          write_op);
 447
 448        /*
 449         * Wait for all previously submitted IO to complete.
 450         */
 451        spin_lock(&journal->j_list_lock);
 452        while (commit_transaction->t_locked_list) {
 453                struct buffer_head *bh;
 454
 455                jh = commit_transaction->t_locked_list->b_tprev;
 456                bh = jh2bh(jh);
 457                get_bh(bh);
 458                if (buffer_locked(bh)) {
 459                        spin_unlock(&journal->j_list_lock);
 460                        wait_on_buffer(bh);
 461                        spin_lock(&journal->j_list_lock);
 462                }
 463                if (unlikely(!buffer_uptodate(bh))) {
 464                        if (!trylock_page(bh->b_page)) {
 465                                spin_unlock(&journal->j_list_lock);
 466                                lock_page(bh->b_page);
 467                                spin_lock(&journal->j_list_lock);
 468                        }
 469                        if (bh->b_page->mapping)
 470                                set_bit(AS_EIO, &bh->b_page->mapping->flags);
 471
 472                        unlock_page(bh->b_page);
 473                        SetPageError(bh->b_page);
 474                        err = -EIO;
 475                }
 476                if (!inverted_lock(journal, bh)) {
 477                        put_bh(bh);
 478                        spin_lock(&journal->j_list_lock);
 479                        continue;
 480                }
 481                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
 482                    jh->b_transaction == commit_transaction &&
 483                    jh->b_jlist == BJ_Locked) {
 484                        __journal_unfile_buffer(jh);
 485                        jbd_unlock_bh_state(bh);
 486                        journal_remove_journal_head(bh);
 487                        put_bh(bh);
 488                } else {
 489                        jbd_unlock_bh_state(bh);
 490                }
 491                release_data_buffer(bh);
 492                cond_resched_lock(&journal->j_list_lock);
 493        }
 494        spin_unlock(&journal->j_list_lock);
 495
 496        if (err) {
 497                char b[BDEVNAME_SIZE];
 498
 499                printk(KERN_WARNING
 500                        "JBD: Detected IO errors while flushing file data "
 501                        "on %s\n", bdevname(journal->j_fs_dev, b));
 502                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
 503                        journal_abort(journal, err);
 504                err = 0;
 505        }
 506
 507        journal_write_revoke_records(journal, commit_transaction, write_op);
 508
 509        /*
 510         * If we found any dirty or locked buffers, then we should have
 511         * looped back up to the write_out_data label.  If there weren't
 512         * any then journal_clean_data_list should have wiped the list
 513         * clean by now, so check that it is in fact empty.
 514         */
 515        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 516
 517        jbd_debug (3, "JBD: commit phase 3\n");
 518
 519        /*
 520         * Way to go: we have now written out all of the data for a
 521         * transaction!  Now comes the tricky part: we need to write out
 522         * metadata.  Loop over the transaction's entire buffer list:
 523         */
 524        spin_lock(&journal->j_state_lock);
 525        commit_transaction->t_state = T_COMMIT;
 526        spin_unlock(&journal->j_state_lock);
 527
 528        J_ASSERT(commit_transaction->t_nr_buffers <=
 529                 commit_transaction->t_outstanding_credits);
 530
 531        descriptor = NULL;
 532        bufs = 0;
 533        while (commit_transaction->t_buffers) {
 534
 535                /* Find the next buffer to be journaled... */
 536
 537                jh = commit_transaction->t_buffers;
 538
 539                /* If we're in abort mode, we just un-journal the buffer and
 540                   release it. */
 541
 542                if (is_journal_aborted(journal)) {
 543                        clear_buffer_jbddirty(jh2bh(jh));
 544                        JBUFFER_TRACE(jh, "journal is aborting: refile");
 545                        journal_refile_buffer(journal, jh);
 546                        /* If that was the last one, we need to clean up
 547                         * any descriptor buffers which may have been
 548                         * already allocated, even if we are now
 549                         * aborting. */
 550                        if (!commit_transaction->t_buffers)
 551                                goto start_journal_io;
 552                        continue;
 553                }
 554
 555                /* Make sure we have a descriptor block in which to
 556                   record the metadata buffer. */
 557
 558                if (!descriptor) {
 559                        struct buffer_head *bh;
 560
 561                        J_ASSERT (bufs == 0);
 562
 563                        jbd_debug(4, "JBD: get descriptor\n");
 564
 565                        descriptor = journal_get_descriptor_buffer(journal);
 566                        if (!descriptor) {
 567                                journal_abort(journal, -EIO);
 568                                continue;
 569                        }
 570
 571                        bh = jh2bh(descriptor);
 572                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 573                                (unsigned long long)bh->b_blocknr, bh->b_data);
 574                        header = (journal_header_t *)&bh->b_data[0];
 575                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
 576                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
 577                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 578
 579                        tagp = &bh->b_data[sizeof(journal_header_t)];
 580                        space_left = bh->b_size - sizeof(journal_header_t);
 581                        first_tag = 1;
 582                        set_buffer_jwrite(bh);
 583                        set_buffer_dirty(bh);
 584                        wbuf[bufs++] = bh;
 585
 586                        /* Record it so that we can wait for IO
 587                           completion later */
 588                        BUFFER_TRACE(bh, "ph3: file as descriptor");
 589                        journal_file_buffer(descriptor, commit_transaction,
 590                                        BJ_LogCtl);
 591                }
 592
 593                /* Where is the buffer to be written? */
 594
 595                err = journal_next_log_block(journal, &blocknr);
 596                /* If the block mapping failed, just abandon the buffer
 597                   and repeat this loop: we'll fall into the
 598                   refile-on-abort condition above. */
 599                if (err) {
 600                        journal_abort(journal, err);
 601                        continue;
 602                }
 603
 604                /*
 605                 * start_this_handle() uses t_outstanding_credits to determine
 606                 * the free space in the log, but this counter is changed
 607                 * by journal_next_log_block() also.
 608                 */
 609                commit_transaction->t_outstanding_credits--;
 610
 611                /* Bump b_count to prevent truncate from stumbling over
 612                   the shadowed buffer!  @@@ This can go if we ever get
 613                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 614                atomic_inc(&jh2bh(jh)->b_count);
 615
 616                /* Make a temporary IO buffer with which to write it out
 617                   (this will requeue both the metadata buffer and the
 618                   temporary IO buffer). new_bh goes on BJ_IO*/
 619
 620                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 621                /*
 622                 * akpm: journal_write_metadata_buffer() sets
 623                 * new_bh->b_transaction to commit_transaction.
 624                 * We need to clean this up before we release new_bh
 625                 * (which is of type BJ_IO)
 626                 */
 627                JBUFFER_TRACE(jh, "ph3: write metadata");
 628                flags = journal_write_metadata_buffer(commit_transaction,
 629                                                      jh, &new_jh, blocknr);
 630                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 631                wbuf[bufs++] = jh2bh(new_jh);
 632
 633                /* Record the new block's tag in the current descriptor
 634                   buffer */
 635
 636                tag_flag = 0;
 637                if (flags & 1)
 638                        tag_flag |= JFS_FLAG_ESCAPE;
 639                if (!first_tag)
 640                        tag_flag |= JFS_FLAG_SAME_UUID;
 641
 642                tag = (journal_block_tag_t *) tagp;
 643                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
 644                tag->t_flags = cpu_to_be32(tag_flag);
 645                tagp += sizeof(journal_block_tag_t);
 646                space_left -= sizeof(journal_block_tag_t);
 647
 648                if (first_tag) {
 649                        memcpy (tagp, journal->j_uuid, 16);
 650                        tagp += 16;
 651                        space_left -= 16;
 652                        first_tag = 0;
 653                }
 654
 655                /* If there's no more to do, or if the descriptor is full,
 656                   let the IO rip! */
 657
 658                if (bufs == journal->j_wbufsize ||
 659                    commit_transaction->t_buffers == NULL ||
 660                    space_left < sizeof(journal_block_tag_t) + 16) {
 661
 662                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 663
 664                        /* Write an end-of-descriptor marker before
 665                           submitting the IOs.  "tag" still points to
 666                           the last tag we set up. */
 667
 668                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
 669
 670start_journal_io:
 671                        for (i = 0; i < bufs; i++) {
 672                                struct buffer_head *bh = wbuf[i];
 673                                lock_buffer(bh);
 674                                clear_buffer_dirty(bh);
 675                                set_buffer_uptodate(bh);
 676                                bh->b_end_io = journal_end_buffer_io_sync;
 677                                submit_bh(write_op, bh);
 678                        }
 679                        cond_resched();
 680
 681                        /* Force a new descriptor to be generated next
 682                           time round the loop. */
 683                        descriptor = NULL;
 684                        bufs = 0;
 685                }
 686        }
 687
 688        /* Lo and behold: we have just managed to send a transaction to
 689           the log.  Before we can commit it, wait for the IO so far to
 690           complete.  Control buffers being written are on the
 691           transaction's t_log_list queue, and metadata buffers are on
 692           the t_iobuf_list queue.
 693
 694           Wait for the buffers in reverse order.  That way we are
 695           less likely to be woken up until all IOs have completed, and
 696           so we incur less scheduling load.
 697        */
 698
 699        jbd_debug(3, "JBD: commit phase 4\n");
 700
 701        /*
 702         * akpm: these are BJ_IO, and j_list_lock is not needed.
 703         * See __journal_try_to_free_buffer.
 704         */
 705wait_for_iobuf:
 706        while (commit_transaction->t_iobuf_list != NULL) {
 707                struct buffer_head *bh;
 708
 709                jh = commit_transaction->t_iobuf_list->b_tprev;
 710                bh = jh2bh(jh);
 711                if (buffer_locked(bh)) {
 712                        wait_on_buffer(bh);
 713                        goto wait_for_iobuf;
 714                }
 715                if (cond_resched())
 716                        goto wait_for_iobuf;
 717
 718                if (unlikely(!buffer_uptodate(bh)))
 719                        err = -EIO;
 720
 721                clear_buffer_jwrite(bh);
 722
 723                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 724                journal_unfile_buffer(journal, jh);
 725
 726                /*
 727                 * ->t_iobuf_list should contain only dummy buffer_heads
 728                 * which were created by journal_write_metadata_buffer().
 729                 */
 730                BUFFER_TRACE(bh, "dumping temporary bh");
 731                journal_put_journal_head(jh);
 732                __brelse(bh);
 733                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 734                free_buffer_head(bh);
 735
 736                /* We also have to unlock and free the corresponding
 737                   shadowed buffer */
 738                jh = commit_transaction->t_shadow_list->b_tprev;
 739                bh = jh2bh(jh);
 740                clear_bit(BH_JWrite, &bh->b_state);
 741                J_ASSERT_BH(bh, buffer_jbddirty(bh));
 742
 743                /* The metadata is now released for reuse, but we need
 744                   to remember it against this transaction so that when
 745                   we finally commit, we can do any checkpointing
 746                   required. */
 747                JBUFFER_TRACE(jh, "file as BJ_Forget");
 748                journal_file_buffer(jh, commit_transaction, BJ_Forget);
 749                /* Wake up any transactions which were waiting for this
 750                   IO to complete */
 751                wake_up_bit(&bh->b_state, BH_Unshadow);
 752                JBUFFER_TRACE(jh, "brelse shadowed buffer");
 753                __brelse(bh);
 754        }
 755
 756        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 757
 758        jbd_debug(3, "JBD: commit phase 5\n");
 759
 760        /* Here we wait for the revoke record and descriptor record buffers */
 761 wait_for_ctlbuf:
 762        while (commit_transaction->t_log_list != NULL) {
 763                struct buffer_head *bh;
 764
 765                jh = commit_transaction->t_log_list->b_tprev;
 766                bh = jh2bh(jh);
 767                if (buffer_locked(bh)) {
 768                        wait_on_buffer(bh);
 769                        goto wait_for_ctlbuf;
 770                }
 771                if (cond_resched())
 772                        goto wait_for_ctlbuf;
 773
 774                if (unlikely(!buffer_uptodate(bh)))
 775                        err = -EIO;
 776
 777                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 778                clear_buffer_jwrite(bh);
 779                journal_unfile_buffer(journal, jh);
 780                journal_put_journal_head(jh);
 781                __brelse(bh);           /* One for getblk */
 782                /* AKPM: bforget here */
 783        }
 784
 785        if (err)
 786                journal_abort(journal, err);
 787
 788        jbd_debug(3, "JBD: commit phase 6\n");
 789
 790        if (journal_write_commit_record(journal, commit_transaction))
 791                err = -EIO;
 792
 793        if (err)
 794                journal_abort(journal, err);
 795
 796        /* End of a transaction!  Finally, we can do checkpoint
 797           processing: any buffers committed as a result of this
 798           transaction can be removed from any checkpoint list it was on
 799           before. */
 800
 801        jbd_debug(3, "JBD: commit phase 7\n");
 802
 803        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 804        J_ASSERT(commit_transaction->t_buffers == NULL);
 805        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 806        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 807        J_ASSERT(commit_transaction->t_shadow_list == NULL);
 808        J_ASSERT(commit_transaction->t_log_list == NULL);
 809
 810restart_loop:
 811        /*
 812         * As there are other places (journal_unmap_buffer()) adding buffers
 813         * to this list we have to be careful and hold the j_list_lock.
 814         */
 815        spin_lock(&journal->j_list_lock);
 816        while (commit_transaction->t_forget) {
 817                transaction_t *cp_transaction;
 818                struct buffer_head *bh;
 819
 820                jh = commit_transaction->t_forget;
 821                spin_unlock(&journal->j_list_lock);
 822                bh = jh2bh(jh);
 823                jbd_lock_bh_state(bh);
 824                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 825                        jh->b_transaction == journal->j_running_transaction);
 826
 827                /*
 828                 * If there is undo-protected committed data against
 829                 * this buffer, then we can remove it now.  If it is a
 830                 * buffer needing such protection, the old frozen_data
 831                 * field now points to a committed version of the
 832                 * buffer, so rotate that field to the new committed
 833                 * data.
 834                 *
 835                 * Otherwise, we can just throw away the frozen data now.
 836                 */
 837                if (jh->b_committed_data) {
 838                        jbd_free(jh->b_committed_data, bh->b_size);
 839                        jh->b_committed_data = NULL;
 840                        if (jh->b_frozen_data) {
 841                                jh->b_committed_data = jh->b_frozen_data;
 842                                jh->b_frozen_data = NULL;
 843                        }
 844                } else if (jh->b_frozen_data) {
 845                        jbd_free(jh->b_frozen_data, bh->b_size);
 846                        jh->b_frozen_data = NULL;
 847                }
 848
 849                spin_lock(&journal->j_list_lock);
 850                cp_transaction = jh->b_cp_transaction;
 851                if (cp_transaction) {
 852                        JBUFFER_TRACE(jh, "remove from old cp transaction");
 853                        __journal_remove_checkpoint(jh);
 854                }
 855
 856                /* Only re-checkpoint the buffer_head if it is marked
 857                 * dirty.  If the buffer was added to the BJ_Forget list
 858                 * by journal_forget, it may no longer be dirty and
 859                 * there's no point in keeping a checkpoint record for
 860                 * it. */
 861
 862                /* A buffer which has been freed while still being
 863                 * journaled by a previous transaction may end up still
 864                 * being dirty here, but we want to avoid writing back
 865                 * that buffer in the future now that the last use has
 866                 * been committed.  That's not only a performance gain,
 867                 * it also stops aliasing problems if the buffer is left
 868                 * behind for writeback and gets reallocated for another
 869                 * use in a different page. */
 870                if (buffer_freed(bh)) {
 871                        clear_buffer_freed(bh);
 872                        clear_buffer_jbddirty(bh);
 873                }
 874
 875                if (buffer_jbddirty(bh)) {
 876                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
 877                        __journal_insert_checkpoint(jh, commit_transaction);
 878                        if (is_journal_aborted(journal))
 879                                clear_buffer_jbddirty(bh);
 880                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 881                        __journal_refile_buffer(jh);
 882                        jbd_unlock_bh_state(bh);
 883                } else {
 884                        J_ASSERT_BH(bh, !buffer_dirty(bh));
 885                        /* The buffer on BJ_Forget list and not jbddirty means
 886                         * it has been freed by this transaction and hence it
 887                         * could not have been reallocated until this
 888                         * transaction has committed. *BUT* it could be
 889                         * reallocated once we have written all the data to
 890                         * disk and before we process the buffer on BJ_Forget
 891                         * list. */
 892                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 893                        __journal_refile_buffer(jh);
 894                        if (!jh->b_transaction) {
 895                                jbd_unlock_bh_state(bh);
 896                                 /* needs a brelse */
 897                                journal_remove_journal_head(bh);
 898                                release_buffer_page(bh);
 899                        } else
 900                                jbd_unlock_bh_state(bh);
 901                }
 902                cond_resched_lock(&journal->j_list_lock);
 903        }
 904        spin_unlock(&journal->j_list_lock);
 905        /*
 906         * This is a bit sleazy.  We use j_list_lock to protect transition
 907         * of a transaction into T_FINISHED state and calling
 908         * __journal_drop_transaction(). Otherwise we could race with
 909         * other checkpointing code processing the transaction...
 910         */
 911        spin_lock(&journal->j_state_lock);
 912        spin_lock(&journal->j_list_lock);
 913        /*
 914         * Now recheck if some buffers did not get attached to the transaction
 915         * while the lock was dropped...
 916         */
 917        if (commit_transaction->t_forget) {
 918                spin_unlock(&journal->j_list_lock);
 919                spin_unlock(&journal->j_state_lock);
 920                goto restart_loop;
 921        }
 922
 923        /* Done with this transaction! */
 924
 925        jbd_debug(3, "JBD: commit phase 8\n");
 926
 927        J_ASSERT(commit_transaction->t_state == T_COMMIT);
 928
 929        commit_transaction->t_state = T_FINISHED;
 930        J_ASSERT(commit_transaction == journal->j_committing_transaction);
 931        journal->j_commit_sequence = commit_transaction->t_tid;
 932        journal->j_committing_transaction = NULL;
 933        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 934
 935        /*
 936         * weight the commit time higher than the average time so we don't
 937         * react too strongly to vast changes in commit time
 938         */
 939        if (likely(journal->j_average_commit_time))
 940                journal->j_average_commit_time = (commit_time*3 +
 941                                journal->j_average_commit_time) / 4;
 942        else
 943                journal->j_average_commit_time = commit_time;
 944
 945        spin_unlock(&journal->j_state_lock);
 946
 947        if (commit_transaction->t_checkpoint_list == NULL &&
 948            commit_transaction->t_checkpoint_io_list == NULL) {
 949                __journal_drop_transaction(journal, commit_transaction);
 950        } else {
 951                if (journal->j_checkpoint_transactions == NULL) {
 952                        journal->j_checkpoint_transactions = commit_transaction;
 953                        commit_transaction->t_cpnext = commit_transaction;
 954                        commit_transaction->t_cpprev = commit_transaction;
 955                } else {
 956                        commit_transaction->t_cpnext =
 957                                journal->j_checkpoint_transactions;
 958                        commit_transaction->t_cpprev =
 959                                commit_transaction->t_cpnext->t_cpprev;
 960                        commit_transaction->t_cpnext->t_cpprev =
 961                                commit_transaction;
 962                        commit_transaction->t_cpprev->t_cpnext =
 963                                commit_transaction;
 964                }
 965        }
 966        spin_unlock(&journal->j_list_lock);
 967
 968        jbd_debug(1, "JBD: commit %d complete, head %d\n",
 969                  journal->j_commit_sequence, journal->j_tail_sequence);
 970
 971        wake_up(&journal->j_wait_done_commit);
 972}
 973