linux/fs/jbd2/commit.c
<<
>>
Prefs
   1/*
   2 * linux/fs/jbd2/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd2.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23#include <linux/jiffies.h>
  24#include <linux/crc32.h>
  25#include <linux/writeback.h>
  26#include <linux/backing-dev.h>
  27#include <linux/bio.h>
  28#include <linux/blkdev.h>
  29#include <linux/bitops.h>
  30#include <trace/events/jbd2.h>
  31
  32/*
  33 * Default IO end handler for temporary BJ_IO buffer_heads.
  34 */
  35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  36{
  37        BUFFER_TRACE(bh, "");
  38        if (uptodate)
  39                set_buffer_uptodate(bh);
  40        else
  41                clear_buffer_uptodate(bh);
  42        unlock_buffer(bh);
  43}
  44
  45/*
  46 * When an ext4 file is truncated, it is possible that some pages are not
  47 * successfully freed, because they are attached to a committing transaction.
  48 * After the transaction commits, these pages are left on the LRU, with no
  49 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  50 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  51 * the numbers in /proc/meminfo look odd.
  52 *
  53 * So here, we have a buffer which has just come off the forget list.  Look to
  54 * see if we can strip all buffers from the backing page.
  55 *
  56 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  57 * caller provided us with a ref against the buffer, and we drop that here.
  58 */
  59static void release_buffer_page(struct buffer_head *bh)
  60{
  61        struct page *page;
  62
  63        if (buffer_dirty(bh))
  64                goto nope;
  65        if (atomic_read(&bh->b_count) != 1)
  66                goto nope;
  67        page = bh->b_page;
  68        if (!page)
  69                goto nope;
  70        if (page->mapping)
  71                goto nope;
  72
  73        /* OK, it's a truncated page */
  74        if (!trylock_page(page))
  75                goto nope;
  76
  77        page_cache_get(page);
  78        __brelse(bh);
  79        try_to_free_buffers(page);
  80        unlock_page(page);
  81        page_cache_release(page);
  82        return;
  83
  84nope:
  85        __brelse(bh);
  86}
  87
  88static void jbd2_commit_block_csum_set(journal_t *j,
  89                                       struct journal_head *descriptor)
  90{
  91        struct commit_header *h;
  92        __u32 csum;
  93
  94        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
  95                return;
  96
  97        h = (struct commit_header *)(jh2bh(descriptor)->b_data);
  98        h->h_chksum_type = 0;
  99        h->h_chksum_size = 0;
 100        h->h_chksum[0] = 0;
 101        csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
 102                           j->j_blocksize);
 103        h->h_chksum[0] = cpu_to_be32(csum);
 104}
 105
 106/*
 107 * Done it all: now submit the commit record.  We should have
 108 * cleaned up our previous buffers by now, so if we are in abort
 109 * mode we can now just skip the rest of the journal write
 110 * entirely.
 111 *
 112 * Returns 1 if the journal needs to be aborted or 0 on success
 113 */
 114static int journal_submit_commit_record(journal_t *journal,
 115                                        transaction_t *commit_transaction,
 116                                        struct buffer_head **cbh,
 117                                        __u32 crc32_sum)
 118{
 119        struct journal_head *descriptor;
 120        struct commit_header *tmp;
 121        struct buffer_head *bh;
 122        int ret;
 123        struct timespec now = current_kernel_time();
 124
 125        *cbh = NULL;
 126
 127        if (is_journal_aborted(journal))
 128                return 0;
 129
 130        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 131        if (!descriptor)
 132                return 1;
 133
 134        bh = jh2bh(descriptor);
 135
 136        tmp = (struct commit_header *)bh->b_data;
 137        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 138        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 139        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 140        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 141        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 142
 143        if (JBD2_HAS_COMPAT_FEATURE(journal,
 144                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
 145                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 146                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 147                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 148        }
 149        jbd2_commit_block_csum_set(journal, descriptor);
 150
 151        JBUFFER_TRACE(descriptor, "submit commit block");
 152        lock_buffer(bh);
 153        clear_buffer_dirty(bh);
 154        set_buffer_uptodate(bh);
 155        bh->b_end_io = journal_end_buffer_io_sync;
 156
 157        if (journal->j_flags & JBD2_BARRIER &&
 158            !JBD2_HAS_INCOMPAT_FEATURE(journal,
 159                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 160                ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 161        else
 162                ret = submit_bh(WRITE_SYNC, bh);
 163
 164        *cbh = bh;
 165        return ret;
 166}
 167
 168/*
 169 * This function along with journal_submit_commit_record
 170 * allows to write the commit record asynchronously.
 171 */
 172static int journal_wait_on_commit_record(journal_t *journal,
 173                                         struct buffer_head *bh)
 174{
 175        int ret = 0;
 176
 177        clear_buffer_dirty(bh);
 178        wait_on_buffer(bh);
 179
 180        if (unlikely(!buffer_uptodate(bh)))
 181                ret = -EIO;
 182        put_bh(bh);            /* One for getblk() */
 183        jbd2_journal_put_journal_head(bh2jh(bh));
 184
 185        return ret;
 186}
 187
 188/*
 189 * write the filemap data using writepage() address_space_operations.
 190 * We don't do block allocation here even for delalloc. We don't
 191 * use writepages() because with dealyed allocation we may be doing
 192 * block allocation in writepages().
 193 */
 194static int journal_submit_inode_data_buffers(struct address_space *mapping)
 195{
 196        int ret;
 197        struct writeback_control wbc = {
 198                .sync_mode =  WB_SYNC_ALL,
 199                .nr_to_write = mapping->nrpages * 2,
 200                .range_start = 0,
 201                .range_end = i_size_read(mapping->host),
 202        };
 203
 204        ret = generic_writepages(mapping, &wbc);
 205        return ret;
 206}
 207
 208/*
 209 * Submit all the data buffers of inode associated with the transaction to
 210 * disk.
 211 *
 212 * We are in a committing transaction. Therefore no new inode can be added to
 213 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 214 * operate on from being released while we write out pages.
 215 */
 216static int journal_submit_data_buffers(journal_t *journal,
 217                transaction_t *commit_transaction)
 218{
 219        struct jbd2_inode *jinode;
 220        int err, ret = 0;
 221        struct address_space *mapping;
 222
 223        spin_lock(&journal->j_list_lock);
 224        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 225                mapping = jinode->i_vfs_inode->i_mapping;
 226                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 227                spin_unlock(&journal->j_list_lock);
 228                /*
 229                 * submit the inode data buffers. We use writepage
 230                 * instead of writepages. Because writepages can do
 231                 * block allocation  with delalloc. We need to write
 232                 * only allocated blocks here.
 233                 */
 234                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 235                err = journal_submit_inode_data_buffers(mapping);
 236                if (!ret)
 237                        ret = err;
 238                spin_lock(&journal->j_list_lock);
 239                J_ASSERT(jinode->i_transaction == commit_transaction);
 240                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 241                smp_mb__after_clear_bit();
 242                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 243        }
 244        spin_unlock(&journal->j_list_lock);
 245        return ret;
 246}
 247
 248/*
 249 * Wait for data submitted for writeout, refile inodes to proper
 250 * transaction if needed.
 251 *
 252 */
 253static int journal_finish_inode_data_buffers(journal_t *journal,
 254                transaction_t *commit_transaction)
 255{
 256        struct jbd2_inode *jinode, *next_i;
 257        int err, ret = 0;
 258
 259        /* For locking, see the comment in journal_submit_data_buffers() */
 260        spin_lock(&journal->j_list_lock);
 261        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 262                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 263                spin_unlock(&journal->j_list_lock);
 264                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 265                if (err) {
 266                        /*
 267                         * Because AS_EIO is cleared by
 268                         * filemap_fdatawait_range(), set it again so
 269                         * that user process can get -EIO from fsync().
 270                         */
 271                        set_bit(AS_EIO,
 272                                &jinode->i_vfs_inode->i_mapping->flags);
 273
 274                        if (!ret)
 275                                ret = err;
 276                }
 277                spin_lock(&journal->j_list_lock);
 278                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 279                smp_mb__after_clear_bit();
 280                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 281        }
 282
 283        /* Now refile inode to proper lists */
 284        list_for_each_entry_safe(jinode, next_i,
 285                                 &commit_transaction->t_inode_list, i_list) {
 286                list_del(&jinode->i_list);
 287                if (jinode->i_next_transaction) {
 288                        jinode->i_transaction = jinode->i_next_transaction;
 289                        jinode->i_next_transaction = NULL;
 290                        list_add(&jinode->i_list,
 291                                &jinode->i_transaction->t_inode_list);
 292                } else {
 293                        jinode->i_transaction = NULL;
 294                }
 295        }
 296        spin_unlock(&journal->j_list_lock);
 297
 298        return ret;
 299}
 300
 301static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 302{
 303        struct page *page = bh->b_page;
 304        char *addr;
 305        __u32 checksum;
 306
 307        addr = kmap_atomic(page);
 308        checksum = crc32_be(crc32_sum,
 309                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 310        kunmap_atomic(addr);
 311
 312        return checksum;
 313}
 314
 315static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 316                                   unsigned long long block)
 317{
 318        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 319        if (tag_bytes > JBD2_TAG_SIZE32)
 320                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 321}
 322
 323static void jbd2_descr_block_csum_set(journal_t *j,
 324                                      struct journal_head *descriptor)
 325{
 326        struct jbd2_journal_block_tail *tail;
 327        __u32 csum;
 328
 329        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 330                return;
 331
 332        tail = (struct jbd2_journal_block_tail *)
 333                        (jh2bh(descriptor)->b_data + j->j_blocksize -
 334                        sizeof(struct jbd2_journal_block_tail));
 335        tail->t_checksum = 0;
 336        csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
 337                           j->j_blocksize);
 338        tail->t_checksum = cpu_to_be32(csum);
 339}
 340
 341static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 342                                    struct buffer_head *bh, __u32 sequence)
 343{
 344        struct page *page = bh->b_page;
 345        __u8 *addr;
 346        __u32 csum;
 347
 348        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 349                return;
 350
 351        sequence = cpu_to_be32(sequence);
 352        addr = kmap_atomic(page);
 353        csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
 354                          sizeof(sequence));
 355        csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
 356                          bh->b_size);
 357        kunmap_atomic(addr);
 358
 359        tag->t_checksum = cpu_to_be32(csum);
 360}
 361/*
 362 * jbd2_journal_commit_transaction
 363 *
 364 * The primary function for committing a transaction to the log.  This
 365 * function is called by the journal thread to begin a complete commit.
 366 */
 367void jbd2_journal_commit_transaction(journal_t *journal)
 368{
 369        struct transaction_stats_s stats;
 370        transaction_t *commit_transaction;
 371        struct journal_head *jh, *new_jh, *descriptor;
 372        struct buffer_head **wbuf = journal->j_wbuf;
 373        int bufs;
 374        int flags;
 375        int err;
 376        unsigned long long blocknr;
 377        ktime_t start_time;
 378        u64 commit_time;
 379        char *tagp = NULL;
 380        journal_header_t *header;
 381        journal_block_tag_t *tag = NULL;
 382        int space_left = 0;
 383        int first_tag = 0;
 384        int tag_flag;
 385        int i;
 386        int tag_bytes = journal_tag_bytes(journal);
 387        struct buffer_head *cbh = NULL; /* For transactional checksums */
 388        __u32 crc32_sum = ~0;
 389        struct blk_plug plug;
 390        /* Tail of the journal */
 391        unsigned long first_block;
 392        tid_t first_tid;
 393        int update_tail;
 394        int csum_size = 0;
 395
 396        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 397                csum_size = sizeof(struct jbd2_journal_block_tail);
 398
 399        /*
 400         * First job: lock down the current transaction and wait for
 401         * all outstanding updates to complete.
 402         */
 403
 404        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 405        if (journal->j_flags & JBD2_FLUSHED) {
 406                jbd_debug(3, "super block updated\n");
 407                mutex_lock(&journal->j_checkpoint_mutex);
 408                /*
 409                 * We hold j_checkpoint_mutex so tail cannot change under us.
 410                 * We don't need any special data guarantees for writing sb
 411                 * since journal is empty and it is ok for write to be
 412                 * flushed only with transaction commit.
 413                 */
 414                jbd2_journal_update_sb_log_tail(journal,
 415                                                journal->j_tail_sequence,
 416                                                journal->j_tail,
 417                                                WRITE_SYNC);
 418                mutex_unlock(&journal->j_checkpoint_mutex);
 419        } else {
 420                jbd_debug(3, "superblock not updated\n");
 421        }
 422
 423        J_ASSERT(journal->j_running_transaction != NULL);
 424        J_ASSERT(journal->j_committing_transaction == NULL);
 425
 426        commit_transaction = journal->j_running_transaction;
 427        J_ASSERT(commit_transaction->t_state == T_RUNNING);
 428
 429        trace_jbd2_start_commit(journal, commit_transaction);
 430        jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 431                        commit_transaction->t_tid);
 432
 433        write_lock(&journal->j_state_lock);
 434        commit_transaction->t_state = T_LOCKED;
 435
 436        trace_jbd2_commit_locking(journal, commit_transaction);
 437        stats.run.rs_wait = commit_transaction->t_max_wait;
 438        stats.run.rs_request_delay = 0;
 439        stats.run.rs_locked = jiffies;
 440        if (commit_transaction->t_requested)
 441                stats.run.rs_request_delay =
 442                        jbd2_time_diff(commit_transaction->t_requested,
 443                                       stats.run.rs_locked);
 444        stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 445                                              stats.run.rs_locked);
 446
 447        spin_lock(&commit_transaction->t_handle_lock);
 448        while (atomic_read(&commit_transaction->t_updates)) {
 449                DEFINE_WAIT(wait);
 450
 451                prepare_to_wait(&journal->j_wait_updates, &wait,
 452                                        TASK_UNINTERRUPTIBLE);
 453                if (atomic_read(&commit_transaction->t_updates)) {
 454                        spin_unlock(&commit_transaction->t_handle_lock);
 455                        write_unlock(&journal->j_state_lock);
 456                        schedule();
 457                        write_lock(&journal->j_state_lock);
 458                        spin_lock(&commit_transaction->t_handle_lock);
 459                }
 460                finish_wait(&journal->j_wait_updates, &wait);
 461        }
 462        spin_unlock(&commit_transaction->t_handle_lock);
 463
 464        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 465                        journal->j_max_transaction_buffers);
 466
 467        /*
 468         * First thing we are allowed to do is to discard any remaining
 469         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 470         * that there are no such buffers: if a large filesystem
 471         * operation like a truncate needs to split itself over multiple
 472         * transactions, then it may try to do a jbd2_journal_restart() while
 473         * there are still BJ_Reserved buffers outstanding.  These must
 474         * be released cleanly from the current transaction.
 475         *
 476         * In this case, the filesystem must still reserve write access
 477         * again before modifying the buffer in the new transaction, but
 478         * we do not require it to remember exactly which old buffers it
 479         * has reserved.  This is consistent with the existing behaviour
 480         * that multiple jbd2_journal_get_write_access() calls to the same
 481         * buffer are perfectly permissible.
 482         */
 483        while (commit_transaction->t_reserved_list) {
 484                jh = commit_transaction->t_reserved_list;
 485                JBUFFER_TRACE(jh, "reserved, unused: refile");
 486                /*
 487                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 488                 * leave undo-committed data.
 489                 */
 490                if (jh->b_committed_data) {
 491                        struct buffer_head *bh = jh2bh(jh);
 492
 493                        jbd_lock_bh_state(bh);
 494                        jbd2_free(jh->b_committed_data, bh->b_size);
 495                        jh->b_committed_data = NULL;
 496                        jbd_unlock_bh_state(bh);
 497                }
 498                jbd2_journal_refile_buffer(journal, jh);
 499        }
 500
 501        /*
 502         * Now try to drop any written-back buffers from the journal's
 503         * checkpoint lists.  We do this *before* commit because it potentially
 504         * frees some memory
 505         */
 506        spin_lock(&journal->j_list_lock);
 507        __jbd2_journal_clean_checkpoint_list(journal);
 508        spin_unlock(&journal->j_list_lock);
 509
 510        jbd_debug(3, "JBD2: commit phase 1\n");
 511
 512        /*
 513         * Clear revoked flag to reflect there is no revoked buffers
 514         * in the next transaction which is going to be started.
 515         */
 516        jbd2_clear_buffer_revoked_flags(journal);
 517
 518        /*
 519         * Switch to a new revoke table.
 520         */
 521        jbd2_journal_switch_revoke_table(journal);
 522
 523        trace_jbd2_commit_flushing(journal, commit_transaction);
 524        stats.run.rs_flushing = jiffies;
 525        stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 526                                             stats.run.rs_flushing);
 527
 528        commit_transaction->t_state = T_FLUSH;
 529        journal->j_committing_transaction = commit_transaction;
 530        journal->j_running_transaction = NULL;
 531        start_time = ktime_get();
 532        commit_transaction->t_log_start = journal->j_head;
 533        wake_up(&journal->j_wait_transaction_locked);
 534        write_unlock(&journal->j_state_lock);
 535
 536        jbd_debug(3, "JBD2: commit phase 2\n");
 537
 538        /*
 539         * Now start flushing things to disk, in the order they appear
 540         * on the transaction lists.  Data blocks go first.
 541         */
 542        err = journal_submit_data_buffers(journal, commit_transaction);
 543        if (err)
 544                jbd2_journal_abort(journal, err);
 545
 546        blk_start_plug(&plug);
 547        jbd2_journal_write_revoke_records(journal, commit_transaction,
 548                                          WRITE_SYNC);
 549        blk_finish_plug(&plug);
 550
 551        jbd_debug(3, "JBD2: commit phase 2\n");
 552
 553        /*
 554         * Way to go: we have now written out all of the data for a
 555         * transaction!  Now comes the tricky part: we need to write out
 556         * metadata.  Loop over the transaction's entire buffer list:
 557         */
 558        write_lock(&journal->j_state_lock);
 559        commit_transaction->t_state = T_COMMIT;
 560        write_unlock(&journal->j_state_lock);
 561
 562        trace_jbd2_commit_logging(journal, commit_transaction);
 563        stats.run.rs_logging = jiffies;
 564        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 565                                               stats.run.rs_logging);
 566        stats.run.rs_blocks =
 567                atomic_read(&commit_transaction->t_outstanding_credits);
 568        stats.run.rs_blocks_logged = 0;
 569
 570        J_ASSERT(commit_transaction->t_nr_buffers <=
 571                 atomic_read(&commit_transaction->t_outstanding_credits));
 572
 573        err = 0;
 574        descriptor = NULL;
 575        bufs = 0;
 576        blk_start_plug(&plug);
 577        while (commit_transaction->t_buffers) {
 578
 579                /* Find the next buffer to be journaled... */
 580
 581                jh = commit_transaction->t_buffers;
 582
 583                /* If we're in abort mode, we just un-journal the buffer and
 584                   release it. */
 585
 586                if (is_journal_aborted(journal)) {
 587                        clear_buffer_jbddirty(jh2bh(jh));
 588                        JBUFFER_TRACE(jh, "journal is aborting: refile");
 589                        jbd2_buffer_abort_trigger(jh,
 590                                                  jh->b_frozen_data ?
 591                                                  jh->b_frozen_triggers :
 592                                                  jh->b_triggers);
 593                        jbd2_journal_refile_buffer(journal, jh);
 594                        /* If that was the last one, we need to clean up
 595                         * any descriptor buffers which may have been
 596                         * already allocated, even if we are now
 597                         * aborting. */
 598                        if (!commit_transaction->t_buffers)
 599                                goto start_journal_io;
 600                        continue;
 601                }
 602
 603                /* Make sure we have a descriptor block in which to
 604                   record the metadata buffer. */
 605
 606                if (!descriptor) {
 607                        struct buffer_head *bh;
 608
 609                        J_ASSERT (bufs == 0);
 610
 611                        jbd_debug(4, "JBD2: get descriptor\n");
 612
 613                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 614                        if (!descriptor) {
 615                                jbd2_journal_abort(journal, -EIO);
 616                                continue;
 617                        }
 618
 619                        bh = jh2bh(descriptor);
 620                        jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 621                                (unsigned long long)bh->b_blocknr, bh->b_data);
 622                        header = (journal_header_t *)&bh->b_data[0];
 623                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 624                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 625                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 626
 627                        tagp = &bh->b_data[sizeof(journal_header_t)];
 628                        space_left = bh->b_size - sizeof(journal_header_t);
 629                        first_tag = 1;
 630                        set_buffer_jwrite(bh);
 631                        set_buffer_dirty(bh);
 632                        wbuf[bufs++] = bh;
 633
 634                        /* Record it so that we can wait for IO
 635                           completion later */
 636                        BUFFER_TRACE(bh, "ph3: file as descriptor");
 637                        jbd2_journal_file_buffer(descriptor, commit_transaction,
 638                                        BJ_LogCtl);
 639                }
 640
 641                /* Where is the buffer to be written? */
 642
 643                err = jbd2_journal_next_log_block(journal, &blocknr);
 644                /* If the block mapping failed, just abandon the buffer
 645                   and repeat this loop: we'll fall into the
 646                   refile-on-abort condition above. */
 647                if (err) {
 648                        jbd2_journal_abort(journal, err);
 649                        continue;
 650                }
 651
 652                /*
 653                 * start_this_handle() uses t_outstanding_credits to determine
 654                 * the free space in the log, but this counter is changed
 655                 * by jbd2_journal_next_log_block() also.
 656                 */
 657                atomic_dec(&commit_transaction->t_outstanding_credits);
 658
 659                /* Bump b_count to prevent truncate from stumbling over
 660                   the shadowed buffer!  @@@ This can go if we ever get
 661                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 662                atomic_inc(&jh2bh(jh)->b_count);
 663
 664                /* Make a temporary IO buffer with which to write it out
 665                   (this will requeue both the metadata buffer and the
 666                   temporary IO buffer). new_bh goes on BJ_IO*/
 667
 668                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 669                /*
 670                 * akpm: jbd2_journal_write_metadata_buffer() sets
 671                 * new_bh->b_transaction to commit_transaction.
 672                 * We need to clean this up before we release new_bh
 673                 * (which is of type BJ_IO)
 674                 */
 675                JBUFFER_TRACE(jh, "ph3: write metadata");
 676                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 677                                                      jh, &new_jh, blocknr);
 678                if (flags < 0) {
 679                        jbd2_journal_abort(journal, flags);
 680                        continue;
 681                }
 682                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 683                wbuf[bufs++] = jh2bh(new_jh);
 684
 685                /* Record the new block's tag in the current descriptor
 686                   buffer */
 687
 688                tag_flag = 0;
 689                if (flags & 1)
 690                        tag_flag |= JBD2_FLAG_ESCAPE;
 691                if (!first_tag)
 692                        tag_flag |= JBD2_FLAG_SAME_UUID;
 693
 694                tag = (journal_block_tag_t *) tagp;
 695                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 696                tag->t_flags = cpu_to_be16(tag_flag);
 697                jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
 698                                        commit_transaction->t_tid);
 699                tagp += tag_bytes;
 700                space_left -= tag_bytes;
 701
 702                if (first_tag) {
 703                        memcpy (tagp, journal->j_uuid, 16);
 704                        tagp += 16;
 705                        space_left -= 16;
 706                        first_tag = 0;
 707                }
 708
 709                /* If there's no more to do, or if the descriptor is full,
 710                   let the IO rip! */
 711
 712                if (bufs == journal->j_wbufsize ||
 713                    commit_transaction->t_buffers == NULL ||
 714                    space_left < tag_bytes + 16 + csum_size) {
 715
 716                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 717
 718                        /* Write an end-of-descriptor marker before
 719                           submitting the IOs.  "tag" still points to
 720                           the last tag we set up. */
 721
 722                        tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
 723
 724                        jbd2_descr_block_csum_set(journal, descriptor);
 725start_journal_io:
 726                        for (i = 0; i < bufs; i++) {
 727                                struct buffer_head *bh = wbuf[i];
 728                                /*
 729                                 * Compute checksum.
 730                                 */
 731                                if (JBD2_HAS_COMPAT_FEATURE(journal,
 732                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
 733                                        crc32_sum =
 734                                            jbd2_checksum_data(crc32_sum, bh);
 735                                }
 736
 737                                lock_buffer(bh);
 738                                clear_buffer_dirty(bh);
 739                                set_buffer_uptodate(bh);
 740                                bh->b_end_io = journal_end_buffer_io_sync;
 741                                submit_bh(WRITE_SYNC, bh);
 742                        }
 743                        cond_resched();
 744                        stats.run.rs_blocks_logged += bufs;
 745
 746                        /* Force a new descriptor to be generated next
 747                           time round the loop. */
 748                        descriptor = NULL;
 749                        bufs = 0;
 750                }
 751        }
 752
 753        err = journal_finish_inode_data_buffers(journal, commit_transaction);
 754        if (err) {
 755                printk(KERN_WARNING
 756                        "JBD2: Detected IO errors while flushing file data "
 757                       "on %s\n", journal->j_devname);
 758                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 759                        jbd2_journal_abort(journal, err);
 760                err = 0;
 761        }
 762
 763        /*
 764         * Get current oldest transaction in the log before we issue flush
 765         * to the filesystem device. After the flush we can be sure that
 766         * blocks of all older transactions are checkpointed to persistent
 767         * storage and we will be safe to update journal start in the
 768         * superblock with the numbers we get here.
 769         */
 770        update_tail =
 771                jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
 772
 773        write_lock(&journal->j_state_lock);
 774        if (update_tail) {
 775                long freed = first_block - journal->j_tail;
 776
 777                if (first_block < journal->j_tail)
 778                        freed += journal->j_last - journal->j_first;
 779                /* Update tail only if we free significant amount of space */
 780                if (freed < journal->j_maxlen / 4)
 781                        update_tail = 0;
 782        }
 783        J_ASSERT(commit_transaction->t_state == T_COMMIT);
 784        commit_transaction->t_state = T_COMMIT_DFLUSH;
 785        write_unlock(&journal->j_state_lock);
 786
 787        /* 
 788         * If the journal is not located on the file system device,
 789         * then we must flush the file system device before we issue
 790         * the commit record
 791         */
 792        if (commit_transaction->t_need_data_flush &&
 793            (journal->j_fs_dev != journal->j_dev) &&
 794            (journal->j_flags & JBD2_BARRIER))
 795                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 796
 797        /* Done it all: now write the commit record asynchronously. */
 798        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 799                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 800                err = journal_submit_commit_record(journal, commit_transaction,
 801                                                 &cbh, crc32_sum);
 802                if (err)
 803                        __jbd2_journal_abort_hard(journal);
 804        }
 805
 806        blk_finish_plug(&plug);
 807
 808        /* Lo and behold: we have just managed to send a transaction to
 809           the log.  Before we can commit it, wait for the IO so far to
 810           complete.  Control buffers being written are on the
 811           transaction's t_log_list queue, and metadata buffers are on
 812           the t_iobuf_list queue.
 813
 814           Wait for the buffers in reverse order.  That way we are
 815           less likely to be woken up until all IOs have completed, and
 816           so we incur less scheduling load.
 817        */
 818
 819        jbd_debug(3, "JBD2: commit phase 3\n");
 820
 821        /*
 822         * akpm: these are BJ_IO, and j_list_lock is not needed.
 823         * See __journal_try_to_free_buffer.
 824         */
 825wait_for_iobuf:
 826        while (commit_transaction->t_iobuf_list != NULL) {
 827                struct buffer_head *bh;
 828
 829                jh = commit_transaction->t_iobuf_list->b_tprev;
 830                bh = jh2bh(jh);
 831                if (buffer_locked(bh)) {
 832                        wait_on_buffer(bh);
 833                        goto wait_for_iobuf;
 834                }
 835                if (cond_resched())
 836                        goto wait_for_iobuf;
 837
 838                if (unlikely(!buffer_uptodate(bh)))
 839                        err = -EIO;
 840
 841                clear_buffer_jwrite(bh);
 842
 843                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 844                jbd2_journal_unfile_buffer(journal, jh);
 845
 846                /*
 847                 * ->t_iobuf_list should contain only dummy buffer_heads
 848                 * which were created by jbd2_journal_write_metadata_buffer().
 849                 */
 850                BUFFER_TRACE(bh, "dumping temporary bh");
 851                jbd2_journal_put_journal_head(jh);
 852                __brelse(bh);
 853                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 854                free_buffer_head(bh);
 855
 856                /* We also have to unlock and free the corresponding
 857                   shadowed buffer */
 858                jh = commit_transaction->t_shadow_list->b_tprev;
 859                bh = jh2bh(jh);
 860                clear_bit(BH_JWrite, &bh->b_state);
 861                J_ASSERT_BH(bh, buffer_jbddirty(bh));
 862
 863                /* The metadata is now released for reuse, but we need
 864                   to remember it against this transaction so that when
 865                   we finally commit, we can do any checkpointing
 866                   required. */
 867                JBUFFER_TRACE(jh, "file as BJ_Forget");
 868                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 869                /*
 870                 * Wake up any transactions which were waiting for this IO to
 871                 * complete. The barrier must be here so that changes by
 872                 * jbd2_journal_file_buffer() take effect before wake_up_bit()
 873                 * does the waitqueue check.
 874                 */
 875                smp_mb();
 876                wake_up_bit(&bh->b_state, BH_Unshadow);
 877                JBUFFER_TRACE(jh, "brelse shadowed buffer");
 878                __brelse(bh);
 879        }
 880
 881        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 882
 883        jbd_debug(3, "JBD2: commit phase 4\n");
 884
 885        /* Here we wait for the revoke record and descriptor record buffers */
 886 wait_for_ctlbuf:
 887        while (commit_transaction->t_log_list != NULL) {
 888                struct buffer_head *bh;
 889
 890                jh = commit_transaction->t_log_list->b_tprev;
 891                bh = jh2bh(jh);
 892                if (buffer_locked(bh)) {
 893                        wait_on_buffer(bh);
 894                        goto wait_for_ctlbuf;
 895                }
 896                if (cond_resched())
 897                        goto wait_for_ctlbuf;
 898
 899                if (unlikely(!buffer_uptodate(bh)))
 900                        err = -EIO;
 901
 902                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 903                clear_buffer_jwrite(bh);
 904                jbd2_journal_unfile_buffer(journal, jh);
 905                jbd2_journal_put_journal_head(jh);
 906                __brelse(bh);           /* One for getblk */
 907                /* AKPM: bforget here */
 908        }
 909
 910        if (err)
 911                jbd2_journal_abort(journal, err);
 912
 913        jbd_debug(3, "JBD2: commit phase 5\n");
 914        write_lock(&journal->j_state_lock);
 915        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 916        commit_transaction->t_state = T_COMMIT_JFLUSH;
 917        write_unlock(&journal->j_state_lock);
 918
 919        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 920                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 921                err = journal_submit_commit_record(journal, commit_transaction,
 922                                                &cbh, crc32_sum);
 923                if (err)
 924                        __jbd2_journal_abort_hard(journal);
 925        }
 926        if (cbh)
 927                err = journal_wait_on_commit_record(journal, cbh);
 928        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 929                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 930            journal->j_flags & JBD2_BARRIER) {
 931                blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 932        }
 933
 934        if (err)
 935                jbd2_journal_abort(journal, err);
 936
 937        /*
 938         * Now disk caches for filesystem device are flushed so we are safe to
 939         * erase checkpointed transactions from the log by updating journal
 940         * superblock.
 941         */
 942        if (update_tail)
 943                jbd2_update_log_tail(journal, first_tid, first_block);
 944
 945        /* End of a transaction!  Finally, we can do checkpoint
 946           processing: any buffers committed as a result of this
 947           transaction can be removed from any checkpoint list it was on
 948           before. */
 949
 950        jbd_debug(3, "JBD2: commit phase 6\n");
 951
 952        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 953        J_ASSERT(commit_transaction->t_buffers == NULL);
 954        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 955        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 956        J_ASSERT(commit_transaction->t_shadow_list == NULL);
 957        J_ASSERT(commit_transaction->t_log_list == NULL);
 958
 959restart_loop:
 960        /*
 961         * As there are other places (journal_unmap_buffer()) adding buffers
 962         * to this list we have to be careful and hold the j_list_lock.
 963         */
 964        spin_lock(&journal->j_list_lock);
 965        while (commit_transaction->t_forget) {
 966                transaction_t *cp_transaction;
 967                struct buffer_head *bh;
 968                int try_to_free = 0;
 969
 970                jh = commit_transaction->t_forget;
 971                spin_unlock(&journal->j_list_lock);
 972                bh = jh2bh(jh);
 973                /*
 974                 * Get a reference so that bh cannot be freed before we are
 975                 * done with it.
 976                 */
 977                get_bh(bh);
 978                jbd_lock_bh_state(bh);
 979                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 980
 981                /*
 982                 * If there is undo-protected committed data against
 983                 * this buffer, then we can remove it now.  If it is a
 984                 * buffer needing such protection, the old frozen_data
 985                 * field now points to a committed version of the
 986                 * buffer, so rotate that field to the new committed
 987                 * data.
 988                 *
 989                 * Otherwise, we can just throw away the frozen data now.
 990                 *
 991                 * We also know that the frozen data has already fired
 992                 * its triggers if they exist, so we can clear that too.
 993                 */
 994                if (jh->b_committed_data) {
 995                        jbd2_free(jh->b_committed_data, bh->b_size);
 996                        jh->b_committed_data = NULL;
 997                        if (jh->b_frozen_data) {
 998                                jh->b_committed_data = jh->b_frozen_data;
 999                                jh->b_frozen_data = NULL;
1000                                jh->b_frozen_triggers = NULL;
1001                        }
1002                } else if (jh->b_frozen_data) {
1003                        jbd2_free(jh->b_frozen_data, bh->b_size);
1004                        jh->b_frozen_data = NULL;
1005                        jh->b_frozen_triggers = NULL;
1006                }
1007
1008                spin_lock(&journal->j_list_lock);
1009                cp_transaction = jh->b_cp_transaction;
1010                if (cp_transaction) {
1011                        JBUFFER_TRACE(jh, "remove from old cp transaction");
1012                        cp_transaction->t_chp_stats.cs_dropped++;
1013                        __jbd2_journal_remove_checkpoint(jh);
1014                }
1015
1016                /* Only re-checkpoint the buffer_head if it is marked
1017                 * dirty.  If the buffer was added to the BJ_Forget list
1018                 * by jbd2_journal_forget, it may no longer be dirty and
1019                 * there's no point in keeping a checkpoint record for
1020                 * it. */
1021
1022                /*
1023                * A buffer which has been freed while still being journaled by
1024                * a previous transaction.
1025                */
1026                if (buffer_freed(bh)) {
1027                        /*
1028                         * If the running transaction is the one containing
1029                         * "add to orphan" operation (b_next_transaction !=
1030                         * NULL), we have to wait for that transaction to
1031                         * commit before we can really get rid of the buffer.
1032                         * So just clear b_modified to not confuse transaction
1033                         * credit accounting and refile the buffer to
1034                         * BJ_Forget of the running transaction. If the just
1035                         * committed transaction contains "add to orphan"
1036                         * operation, we can completely invalidate the buffer
 1037                         * now. We are rather thorough in that since the
1038                         * buffer may be still accessible when blocksize <
1039                         * pagesize and it is attached to the last partial
1040                         * page.
1041                         */
1042                        jh->b_modified = 0;
1043                        if (!jh->b_next_transaction) {
1044                                clear_buffer_freed(bh);
1045                                clear_buffer_jbddirty(bh);
1046                                clear_buffer_mapped(bh);
1047                                clear_buffer_new(bh);
1048                                clear_buffer_req(bh);
1049                                bh->b_bdev = NULL;
1050                        }
1051                }
1052
1053                if (buffer_jbddirty(bh)) {
1054                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
1055                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1056                        if (is_journal_aborted(journal))
1057                                clear_buffer_jbddirty(bh);
1058                } else {
1059                        J_ASSERT_BH(bh, !buffer_dirty(bh));
1060                        /*
1061                         * The buffer on BJ_Forget list and not jbddirty means
1062                         * it has been freed by this transaction and hence it
1063                         * could not have been reallocated until this
1064                         * transaction has committed. *BUT* it could be
1065                         * reallocated once we have written all the data to
1066                         * disk and before we process the buffer on BJ_Forget
1067                         * list.
1068                         */
1069                        if (!jh->b_next_transaction)
1070                                try_to_free = 1;
1071                }
1072                JBUFFER_TRACE(jh, "refile or unfile buffer");
1073                __jbd2_journal_refile_buffer(jh);
1074                jbd_unlock_bh_state(bh);
1075                if (try_to_free)
1076                        release_buffer_page(bh);        /* Drops bh reference */
1077                else
1078                        __brelse(bh);
1079                cond_resched_lock(&journal->j_list_lock);
1080        }
1081        spin_unlock(&journal->j_list_lock);
1082        /*
1083         * This is a bit sleazy.  We use j_list_lock to protect transition
1084         * of a transaction into T_FINISHED state and calling
1085         * __jbd2_journal_drop_transaction(). Otherwise we could race with
1086         * other checkpointing code processing the transaction...
1087         */
1088        write_lock(&journal->j_state_lock);
1089        spin_lock(&journal->j_list_lock);
1090        /*
1091         * Now recheck if some buffers did not get attached to the transaction
1092         * while the lock was dropped...
1093         */
1094        if (commit_transaction->t_forget) {
1095                spin_unlock(&journal->j_list_lock);
1096                write_unlock(&journal->j_state_lock);
1097                goto restart_loop;
1098        }
1099
1100        /* Done with this transaction! */
1101
1102        jbd_debug(3, "JBD2: commit phase 7\n");
1103
1104        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1105
1106        commit_transaction->t_start = jiffies;
1107        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1108                                              commit_transaction->t_start);
1109
1110        /*
1111         * File the transaction statistics
1112         */
1113        stats.ts_tid = commit_transaction->t_tid;
1114        stats.run.rs_handle_count =
1115                atomic_read(&commit_transaction->t_handle_count);
1116        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1117                             commit_transaction->t_tid, &stats.run);
1118
1119        /*
1120         * Calculate overall stats
1121         */
1122        spin_lock(&journal->j_history_lock);
1123        journal->j_stats.ts_tid++;
1124        if (commit_transaction->t_requested)
1125                journal->j_stats.ts_requested++;
1126        journal->j_stats.run.rs_wait += stats.run.rs_wait;
1127        journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1128        journal->j_stats.run.rs_running += stats.run.rs_running;
1129        journal->j_stats.run.rs_locked += stats.run.rs_locked;
1130        journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1131        journal->j_stats.run.rs_logging += stats.run.rs_logging;
1132        journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1133        journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1134        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1135        spin_unlock(&journal->j_history_lock);
1136
1137        commit_transaction->t_state = T_COMMIT_CALLBACK;
1138        J_ASSERT(commit_transaction == journal->j_committing_transaction);
1139        journal->j_commit_sequence = commit_transaction->t_tid;
1140        journal->j_committing_transaction = NULL;
1141        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1142
1143        /*
 1144         * weight the running average higher than the latest commit time so
 1145         * we don't react too strongly to vast changes in the commit time
1146         */
1147        if (likely(journal->j_average_commit_time))
1148                journal->j_average_commit_time = (commit_time +
1149                                journal->j_average_commit_time*3) / 4;
1150        else
1151                journal->j_average_commit_time = commit_time;
1152
1153        write_unlock(&journal->j_state_lock);
1154
1155        if (journal->j_checkpoint_transactions == NULL) {
1156                journal->j_checkpoint_transactions = commit_transaction;
1157                commit_transaction->t_cpnext = commit_transaction;
1158                commit_transaction->t_cpprev = commit_transaction;
1159        } else {
1160                commit_transaction->t_cpnext =
1161                        journal->j_checkpoint_transactions;
1162                commit_transaction->t_cpprev =
1163                        commit_transaction->t_cpnext->t_cpprev;
1164                commit_transaction->t_cpnext->t_cpprev =
1165                        commit_transaction;
1166                commit_transaction->t_cpprev->t_cpnext =
1167                                commit_transaction;
1168        }
1169        spin_unlock(&journal->j_list_lock);
 1170        /* Drop all spin_locks because commit_callback may block.
 1171         * __jbd2_journal_remove_checkpoint() cannot destroy transaction
1172         * under us because it is not marked as T_FINISHED yet */
1173        if (journal->j_commit_callback)
1174                journal->j_commit_callback(journal, commit_transaction);
1175
1176        trace_jbd2_end_commit(journal, commit_transaction);
1177        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1178                  journal->j_commit_sequence, journal->j_tail_sequence);
1179
1180        write_lock(&journal->j_state_lock);
1181        spin_lock(&journal->j_list_lock);
1182        commit_transaction->t_state = T_FINISHED;
1183        /* Recheck checkpoint lists after j_list_lock was dropped */
1184        if (commit_transaction->t_checkpoint_list == NULL &&
1185            commit_transaction->t_checkpoint_io_list == NULL) {
1186                __jbd2_journal_drop_transaction(journal, commit_transaction);
1187                jbd2_journal_free_transaction(commit_transaction);
1188        }
1189        spin_unlock(&journal->j_list_lock);
1190        write_unlock(&journal->j_state_lock);
1191        wake_up(&journal->j_wait_done_commit);
1192}
1193