/* linux/fs/jbd2/journal.c */
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * linux/fs/jbd2/journal.c
   4 *
   5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   6 *
   7 * Copyright 1998 Red Hat corp --- All Rights Reserved
   8 *
   9 * Generic filesystem journal-writing code; part of the ext2fs
  10 * journaling system.
  11 *
  12 * This file manages journals: areas of disk reserved for logging
  13 * transactional updates.  This includes the kernel journaling thread
  14 * which is responsible for scheduling updates to the log.
  15 *
  16 * We do not actually manage the physical storage of the journal in this
  17 * file: that is left to a per-journal policy function, which allows us
  18 * to store the journal within a filesystem-specified area for ext2
  19 * journaling (ext2 can use a reserved inode for storing the log).
  20 */
  21
  22#include <linux/module.h>
  23#include <linux/time.h>
  24#include <linux/fs.h>
  25#include <linux/jbd2.h>
  26#include <linux/errno.h>
  27#include <linux/slab.h>
  28#include <linux/init.h>
  29#include <linux/mm.h>
  30#include <linux/freezer.h>
  31#include <linux/pagemap.h>
  32#include <linux/kthread.h>
  33#include <linux/poison.h>
  34#include <linux/proc_fs.h>
  35#include <linux/seq_file.h>
  36#include <linux/math64.h>
  37#include <linux/hash.h>
  38#include <linux/log2.h>
  39#include <linux/vmalloc.h>
  40#include <linux/backing-dev.h>
  41#include <linux/bitops.h>
  42#include <linux/ratelimit.h>
  43#include <linux/sched/mm.h>
  44
  45#define CREATE_TRACE_POINTS
  46#include <trace/events/jbd2.h>
  47
  48#include <linux/uaccess.h>
  49#include <asm/page.h>
  50
  51#ifdef CONFIG_JBD2_DEBUG
  52ushort jbd2_journal_enable_debug __read_mostly;
  53EXPORT_SYMBOL(jbd2_journal_enable_debug);
  54
  55module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
  56MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
  57#endif
  58
  59EXPORT_SYMBOL(jbd2_journal_extend);
  60EXPORT_SYMBOL(jbd2_journal_stop);
  61EXPORT_SYMBOL(jbd2_journal_lock_updates);
  62EXPORT_SYMBOL(jbd2_journal_unlock_updates);
  63EXPORT_SYMBOL(jbd2_journal_get_write_access);
  64EXPORT_SYMBOL(jbd2_journal_get_create_access);
  65EXPORT_SYMBOL(jbd2_journal_get_undo_access);
  66EXPORT_SYMBOL(jbd2_journal_set_triggers);
  67EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
  68EXPORT_SYMBOL(jbd2_journal_forget);
  69#if 0
  70EXPORT_SYMBOL(journal_sync_buffer);
  71#endif
  72EXPORT_SYMBOL(jbd2_journal_flush);
  73EXPORT_SYMBOL(jbd2_journal_revoke);
  74
  75EXPORT_SYMBOL(jbd2_journal_init_dev);
  76EXPORT_SYMBOL(jbd2_journal_init_inode);
  77EXPORT_SYMBOL(jbd2_journal_check_used_features);
  78EXPORT_SYMBOL(jbd2_journal_check_available_features);
  79EXPORT_SYMBOL(jbd2_journal_set_features);
  80EXPORT_SYMBOL(jbd2_journal_load);
  81EXPORT_SYMBOL(jbd2_journal_destroy);
  82EXPORT_SYMBOL(jbd2_journal_abort);
  83EXPORT_SYMBOL(jbd2_journal_errno);
  84EXPORT_SYMBOL(jbd2_journal_ack_err);
  85EXPORT_SYMBOL(jbd2_journal_clear_err);
  86EXPORT_SYMBOL(jbd2_log_wait_commit);
  87EXPORT_SYMBOL(jbd2_log_start_commit);
  88EXPORT_SYMBOL(jbd2_journal_start_commit);
  89EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
  90EXPORT_SYMBOL(jbd2_journal_wipe);
  91EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
  92EXPORT_SYMBOL(jbd2_journal_invalidatepage);
  93EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
  94EXPORT_SYMBOL(jbd2_journal_force_commit);
  95EXPORT_SYMBOL(jbd2_journal_inode_add_write);
  96EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
  97EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
  98EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
  99EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 100EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 101EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 102EXPORT_SYMBOL(jbd2_inode_cache);
 103
 104static int jbd2_journal_create_slab(size_t slab_size);
 105
 106#ifdef CONFIG_JBD2_DEBUG
 107void __jbd2_debug(int level, const char *file, const char *func,
 108                  unsigned int line, const char *fmt, ...)
 109{
 110        struct va_format vaf;
 111        va_list args;
 112
 113        if (level > jbd2_journal_enable_debug)
 114                return;
 115        va_start(args, fmt);
 116        vaf.fmt = fmt;
 117        vaf.va = &args;
 118        printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
 119        va_end(args);
 120}
 121EXPORT_SYMBOL(__jbd2_debug);
 122#endif
 123
 124/* Checksumming functions */
 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 126{
 127        if (!jbd2_journal_has_csum_v2or3_feature(j))
 128                return 1;
 129
 130        return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
 131}
 132
 133static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
 134{
 135        __u32 csum;
 136        __be32 old_csum;
 137
 138        old_csum = sb->s_checksum;
 139        sb->s_checksum = 0;
 140        csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
 141        sb->s_checksum = old_csum;
 142
 143        return cpu_to_be32(csum);
 144}
 145
 146/*
 147 * Helper function used to manage commit timeouts
 148 */
 149
 150static void commit_timeout(struct timer_list *t)
 151{
 152        journal_t *journal = from_timer(journal, t, j_commit_timer);
 153
 154        wake_up_process(journal->j_task);
 155}
 156
/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 */

static int kjournald2(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	timer_setup(&journal->j_commit_timer, commit_timeout, 0);

	set_freezable();

	/* Record that the journal thread is running */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Make sure that no allocations from this kernel thread will ever
	 * recurse to the fs layer because we are responsible for the
	 * transaction commit and any fs involvement might get stuck waiting for
	 * the transaction commit.
	 */
	memalloc_nofs_save();

	/*
	 * And now, wait forever for commit wakeup events.
	 */
	write_lock(&journal->j_state_lock);

loop:
	if (journal->j_flags & JBD2_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
		journal->j_commit_sequence, journal->j_commit_request);

	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		/* Drop j_state_lock across the (potentially long) commit. */
		write_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		jbd2_journal_commit_transaction(journal);
		write_lock(&journal->j_state_lock);
		goto loop;
	}

	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald2\n");
		write_unlock(&journal->j_state_lock);
		try_to_freeze();
		write_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/*
		 * Re-check all wakeup conditions after queueing on
		 * j_wait_commit so a wakeup between the checks and
		 * schedule() cannot be lost.
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JBD2_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald2 wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?  If the running
	 * transaction's commit interval has expired, request its commit.
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	del_timer_sync(&journal->j_commit_timer);
	/* Clearing j_task tells journal_kill_thread() we are gone. */
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	write_unlock(&journal->j_state_lock);
	return 0;
}
 276
 277static int jbd2_journal_start_thread(journal_t *journal)
 278{
 279        struct task_struct *t;
 280
 281        t = kthread_run(kjournald2, journal, "jbd2/%s",
 282                        journal->j_devname);
 283        if (IS_ERR(t))
 284                return PTR_ERR(t);
 285
 286        wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
 287        return 0;
 288}
 289
/* Tell kjournald2 to exit (via JBD2_UNMOUNT) and wait until it has. */
static void journal_kill_thread(journal_t *journal)
{
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_UNMOUNT;

	while (journal->j_task) {
		/*
		 * Drop the lock before waking and waiting: kjournald2
		 * takes j_state_lock itself before clearing j_task on
		 * its way out.
		 */
		write_unlock(&journal->j_state_lock);
		wake_up(&journal->j_wait_commit);
		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
		write_lock(&journal->j_state_lock);
	}
	write_unlock(&journal->j_state_lock);
}
 303
/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * The function returns a pointer to the buffer_head to be used for IO.
 *
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head  *jh_in,
				  struct buffer_head **bh_out,
				  sector_t blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);
	journal_t *journal = transaction->t_journal;

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	/* __GFP_NOFAIL: this allocation cannot return NULL. */
	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/* keep subsequent assertions sane */
	atomic_set(&new_bh->b_count, 1);

	jbd_lock_bh_state(bh_in);
repeat:
	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page);
	/*
	 * Fire data frozen trigger if data already wasn't frozen.  Do this
	 * before checking for escaping, as the trigger may modify the magic
	 * offset.  If a copy-out happens afterwards, it will have the correct
	 * data in the buffer.
	 */
	if (!done_copy_out)
		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
					   jh_in->b_triggers);

	/*
	 * Check for escaping: a block starting with the journal magic
	 * number must not be written to the log verbatim.
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data);

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		/*
		 * Drop the bh state lock around the allocation; someone
		 * else may install b_frozen_data meanwhile, so re-check
		 * after relocking and restart from "repeat" if they did.
		 */
		jbd_unlock_bh_state(bh_in);
		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
		if (!tmp) {
			brelse(new_bh);
			return -ENOMEM;
		}
		jbd_lock_bh_state(bh_in);
		if (jh_in->b_frozen_data) {
			jbd2_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page);
		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
		kunmap_atomic(mapped_data);

		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;

		/*
		 * This isn't strictly necessary, as we're using frozen
		 * data for the escaping, but it keeps consistency with
		 * b_frozen_data usage.
		 */
		jh_in->b_frozen_triggers = jh_in->b_triggers;
	}

	/*
	 * Did we need to do an escaping?  Now we've done all the
	 * copying, we can finally do so.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data);
	}

	/* Label the new buffer_head with the journal destination block. */
	set_bh_page(new_bh, new_page, new_offset);
	new_bh->b_size = bh_in->b_size;
	new_bh->b_bdev = journal->j_dev;
	new_bh->b_blocknr = blocknr;
	new_bh->b_private = bh_in;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*bh_out = new_bh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
	spin_unlock(&journal->j_list_lock);
	set_buffer_shadow(bh_in);
	jbd_unlock_bh_state(bh_in);

	return do_escape | (done_copy_out << 1);
}
 475
 476/*
 477 * Allocation code for the journal file.  Manage the space left in the
 478 * journal, so that we can begin checkpointing when appropriate.
 479 */
 480
/*
 * Request a commit of the transaction with tid @target.
 *
 * Called with j_state_lock locked for writing.
 * Returns true if a transaction commit was started.
 */
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
	/* Return if the txn has already requested to be committed */
	if (journal->j_commit_request == target)
		return 0;

	/*
	 * The only transaction we can possibly wait upon is the
	 * currently running transaction (if it exists).  Otherwise,
	 * the target tid must be an old one.
	 */
	if (journal->j_running_transaction &&
	    journal->j_running_transaction->t_tid == target) {
		/*
		 * We want a new commit: OK, mark the request and wakeup the
		 * commit thread.  We do _not_ do the commit ourselves.
		 */

		journal->j_commit_request = target;
		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
			  journal->j_commit_request,
			  journal->j_commit_sequence);
		journal->j_running_transaction->t_requested = jiffies;
		wake_up(&journal->j_wait_commit);
		return 1;
	} else if (!tid_geq(journal->j_commit_request, target))
		/* This should never happen, but if it does, preserve
		   the evidence before kjournald goes into a loop and
		   increments j_commit_sequence beyond all recognition. */
		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
			  journal->j_commit_request,
			  journal->j_commit_sequence,
			  target, journal->j_running_transaction ?
			  journal->j_running_transaction->t_tid : 0);
	return 0;
}
 521
 522int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 523{
 524        int ret;
 525
 526        write_lock(&journal->j_state_lock);
 527        ret = __jbd2_log_start_commit(journal, tid);
 528        write_unlock(&journal->j_state_lock);
 529        return ret;
 530}
 531
/*
 * Force and wait any uncommitted transactions.  We can only force the running
 * transaction if we don't have an active handle, otherwise, we will deadlock.
 * Returns: <0 in case of error,
 *           0 if nothing to commit,
 *           1 if transaction was successfully committed.
 */
static int __jbd2_journal_force_commit(journal_t *journal)
{
	transaction_t *transaction = NULL;
	tid_t tid;
	int need_to_start = 0, ret = 0;

	read_lock(&journal->j_state_lock);
	/*
	 * Only target the running transaction when the caller holds no
	 * handle (current->journal_info is NULL); otherwise fall back to
	 * waiting on an already-committing transaction, if any.
	 */
	if (journal->j_running_transaction && !current->journal_info) {
		transaction = journal->j_running_transaction;
		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
			need_to_start = 1;
	} else if (journal->j_committing_transaction)
		transaction = journal->j_committing_transaction;

	if (!transaction) {
		/* Nothing to commit */
		read_unlock(&journal->j_state_lock);
		return 0;
	}
	/* Snapshot the tid before dropping the lock; it cannot change. */
	tid = transaction->t_tid;
	read_unlock(&journal->j_state_lock);
	if (need_to_start)
		jbd2_log_start_commit(journal, tid);
	ret = jbd2_log_wait_commit(journal, tid);
	if (!ret)
		ret = 1;

	return ret;
}
 568
 569/**
 570 * Force and wait upon a commit if the calling process is not within
 571 * transaction.  This is used for forcing out undo-protected data which contains
 572 * bitmaps, when the fs is running out of space.
 573 *
 574 * @journal: journal to force
 575 * Returns true if progress was made.
 576 */
 577int jbd2_journal_force_commit_nested(journal_t *journal)
 578{
 579        int ret;
 580
 581        ret = __jbd2_journal_force_commit(journal);
 582        return ret > 0;
 583}
 584
 585/**
 586 * int journal_force_commit() - force any uncommitted transactions
 587 * @journal: journal to force
 588 *
 589 * Caller want unconditional commit. We can only force the running transaction
 590 * if we don't have an active handle, otherwise, we will deadlock.
 591 */
 592int jbd2_journal_force_commit(journal_t *journal)
 593{
 594        int ret;
 595
 596        J_ASSERT(!current->journal_info);
 597        ret = __jbd2_journal_force_commit(journal);
 598        if (ret > 0)
 599                ret = 0;
 600        return ret;
 601}
 602
 603/*
 604 * Start a commit of the current running transaction (if any).  Returns true
 605 * if a transaction is going to be committed (or is currently already
 606 * committing), and fills its tid in at *ptid
 607 */
 608int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 609{
 610        int ret = 0;
 611
 612        write_lock(&journal->j_state_lock);
 613        if (journal->j_running_transaction) {
 614                tid_t tid = journal->j_running_transaction->t_tid;
 615
 616                __jbd2_log_start_commit(journal, tid);
 617                /* There's a running transaction and we've just made sure
 618                 * it's commit has been scheduled. */
 619                if (ptid)
 620                        *ptid = tid;
 621                ret = 1;
 622        } else if (journal->j_committing_transaction) {
 623                /*
 624                 * If commit has been started, then we have to wait for
 625                 * completion of that transaction.
 626                 */
 627                if (ptid)
 628                        *ptid = journal->j_committing_transaction->t_tid;
 629                ret = 1;
 630        }
 631        write_unlock(&journal->j_state_lock);
 632        return ret;
 633}
 634
/*
 * Return 1 if a given transaction has not yet sent barrier request
 * connected with a transaction commit. If 0 is returned, transaction
 * may or may not have sent the barrier. Used to avoid sending barrier
 * twice in common cases.
 */
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
	int ret = 0;
	transaction_t *commit_trans;

	/* Without JBD2_BARRIER no barrier will ever be sent. */
	if (!(journal->j_flags & JBD2_BARRIER))
		return 0;
	read_lock(&journal->j_state_lock);
	/* Transaction already committed? */
	if (tid_geq(journal->j_commit_sequence, tid))
		goto out;
	commit_trans = journal->j_committing_transaction;
	if (!commit_trans || commit_trans->t_tid != tid) {
		/* Transaction has not started committing yet. */
		ret = 1;
		goto out;
	}
	/*
	 * Transaction is being committed and we already proceeded to
	 * submitting a flush to fs partition?
	 */
	if (journal->j_fs_dev != journal->j_dev) {
		/* External journal: the data flush targets the fs device. */
		if (!commit_trans->t_need_data_flush ||
		    commit_trans->t_state >= T_COMMIT_DFLUSH)
			goto out;
	} else {
		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
			goto out;
	}
	ret = 1;
out:
	read_unlock(&journal->j_state_lock);
	return ret;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
 675
/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 * Returns 0 on success, -EIO if the journal has been aborted.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
	int err = 0;

	read_lock(&journal->j_state_lock);
#ifdef CONFIG_PROVE_LOCKING
	/*
	 * Some callers make sure transaction is already committing and in that
	 * case we cannot block on open handles anymore. So don't warn in that
	 * case.
	 */
	if (tid_gt(tid, journal->j_commit_sequence) &&
	    (!journal->j_committing_transaction ||
	     journal->j_committing_transaction->t_tid != tid)) {
		/* Lockdep annotation only; drop the lock around it. */
		read_unlock(&journal->j_state_lock);
		jbd2_might_wait_for_commit(journal);
		read_lock(&journal->j_state_lock);
	}
#endif
#ifdef CONFIG_JBD2_DEBUG
	/* Waiting on a tid nobody asked to commit would hang forever. */
	if (!tid_geq(journal->j_commit_request, tid)) {
		printk(KERN_ERR
		       "%s: error: j_commit_request=%u, tid=%u\n",
		       __func__, journal->j_commit_request, tid);
	}
#endif
	while (tid_gt(tid, journal->j_commit_sequence)) {
		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
				  tid, journal->j_commit_sequence);
		/* Drop the lock while waking kjournald2 and sleeping. */
		read_unlock(&journal->j_state_lock);
		wake_up(&journal->j_wait_commit);
		wait_event(journal->j_wait_done_commit,
				!tid_gt(tid, journal->j_commit_sequence));
		read_lock(&journal->j_state_lock);
	}
	read_unlock(&journal->j_state_lock);

	if (unlikely(is_journal_aborted(journal)))
		err = -EIO;
	return err;
}
 721
 722/* Return 1 when transaction with given tid has already committed. */
 723int jbd2_transaction_committed(journal_t *journal, tid_t tid)
 724{
 725        int ret = 1;
 726
 727        read_lock(&journal->j_state_lock);
 728        if (journal->j_running_transaction &&
 729            journal->j_running_transaction->t_tid == tid)
 730                ret = 0;
 731        if (journal->j_committing_transaction &&
 732            journal->j_committing_transaction->t_tid == tid)
 733                ret = 0;
 734        read_unlock(&journal->j_state_lock);
 735        return ret;
 736}
 737EXPORT_SYMBOL(jbd2_transaction_committed);
 738
/*
 * When this function returns the transaction corresponding to tid
 * will be completed.  If the transaction has currently running, start
 * committing that transaction before waiting for it to complete.  If
 * the transaction id is stale, it is by definition already completed,
 * so just return SUCCESS.
 */
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
	int	need_to_wait = 1;

	read_lock(&journal->j_state_lock);
	if (journal->j_running_transaction &&
	    journal->j_running_transaction->t_tid == tid) {
		if (journal->j_commit_request != tid) {
			/* transaction not yet started, so request it */
			read_unlock(&journal->j_state_lock);
			jbd2_log_start_commit(journal, tid);
			goto wait_commit;
		}
	} else if (!(journal->j_committing_transaction &&
		     journal->j_committing_transaction->t_tid == tid))
		/* Neither running nor committing: tid is already done. */
		need_to_wait = 0;
	read_unlock(&journal->j_state_lock);
	if (!need_to_wait)
		return 0;
wait_commit:
	return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);
 769
 770/*
 771 * Log buffer allocation routines:
 772 */
 773
 774int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 775{
 776        unsigned long blocknr;
 777
 778        write_lock(&journal->j_state_lock);
 779        J_ASSERT(journal->j_free > 1);
 780
 781        blocknr = journal->j_head;
 782        journal->j_head++;
 783        journal->j_free--;
 784        if (journal->j_head == journal->j_last)
 785                journal->j_head = journal->j_first;
 786        write_unlock(&journal->j_state_lock);
 787        return jbd2_journal_bmap(journal, blocknr, retp);
 788}
 789
 790/*
 791 * Conversion of logical to physical block numbers for the journal
 792 *
 793 * On external journals the journal blocks are identity-mapped, so
 794 * this is a no-op.  If needed, we can use j_blk_offset - everything is
 795 * ready.
 796 */
 797int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 798                 unsigned long long *retp)
 799{
 800        int err = 0;
 801        unsigned long long ret;
 802
 803        if (journal->j_inode) {
 804                ret = bmap(journal->j_inode, blocknr);
 805                if (ret)
 806                        *retp = ret;
 807                else {
 808                        printk(KERN_ALERT "%s: journal block not found "
 809                                        "at offset %lu on %s\n",
 810                               __func__, blocknr, journal->j_devname);
 811                        err = -EIO;
 812                        jbd2_journal_abort(journal, err);
 813                }
 814        } else {
 815                *retp = blocknr; /* +journal->j_blk_offset */
 816        }
 817        return err;
 818}
 819
 820/*
 821 * We play buffer_head aliasing tricks to write data/metadata blocks to
 822 * the journal without copying their contents, but for journal
 823 * descriptor blocks we do need to generate bona fide buffers.
 824 *
 825 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
 826 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
 827 * But we don't bother doing that, so there will be coherency problems with
 828 * mmaps of blockdevs which hold live JBD-controlled filesystems.
 829 */
struct buffer_head *
jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
	journal_t *journal = transaction->t_journal;
	struct buffer_head *bh;
	unsigned long long blocknr;
	journal_header_t *header;
	int err;

	/* Reserve the next free log block for this descriptor. */
	err = jbd2_journal_next_log_block(journal, &blocknr);

	if (err)
		return NULL;

	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	if (!bh)
		return NULL;
	/* The descriptor block consumes one of the transaction's credits. */
	atomic_dec(&transaction->t_outstanding_credits);
	lock_buffer(bh);
	memset(bh->b_data, 0, journal->j_blocksize);
	/* Stamp the common journal header: magic, block type, owning tid. */
	header = (journal_header_t *)bh->b_data;
	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(type);
	header->h_sequence = cpu_to_be32(transaction->t_tid);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	BUFFER_TRACE(bh, "return this buffer");
	return bh;
}
 859
 860void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
 861{
 862        struct jbd2_journal_block_tail *tail;
 863        __u32 csum;
 864
 865        if (!jbd2_journal_has_csum_v2or3(j))
 866                return;
 867
 868        tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
 869                        sizeof(struct jbd2_journal_block_tail));
 870        tail->t_checksum = 0;
 871        csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 872        tail->t_checksum = cpu_to_be32(csum);
 873}
 874
 875/*
 876 * Return tid of the oldest transaction in the journal and block in the journal
 877 * where the transaction starts.
 878 *
 * If the journal is currently empty, return the ID of the next transaction
 * we will write and the journal block where that transaction will start.
 881 *
 882 * The return value is 0 if journal tail cannot be pushed any further, 1 if
 883 * it can.
 884 */
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
			      unsigned long *block)
{
	transaction_t *transaction;
	int ret;

	read_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Find the oldest live transaction, in age order: one still waiting
	 * for checkpoint, then the one being committed, then the currently
	 * running one.  If none exists the journal is empty and the tail is
	 * wherever the next transaction would begin (j_head).
	 */
	transaction = journal->j_checkpoint_transactions;
	if (transaction) {
		*tid = transaction->t_tid;
		*block = transaction->t_log_start;
	} else if ((transaction = journal->j_committing_transaction) != NULL) {
		*tid = transaction->t_tid;
		*block = transaction->t_log_start;
	} else if ((transaction = journal->j_running_transaction) != NULL) {
		*tid = transaction->t_tid;
		*block = journal->j_head;
	} else {
		*tid = journal->j_transaction_sequence;
		*block = journal->j_head;
	}
	/* Tail can be pushed only if the tid found is newer than the tail. */
	ret = tid_gt(*tid, journal->j_tail_sequence);
	spin_unlock(&journal->j_list_lock);
	read_unlock(&journal->j_state_lock);

	return ret;
}
 913
 914/*
 915 * Update information in journal structure and in on disk journal superblock
 916 * about log tail. This function does not check whether information passed in
 917 * really pushes log tail further. It's responsibility of the caller to make
 918 * sure provided log tail information is valid (e.g. by holding
 919 * j_checkpoint_mutex all the time between computing log tail and calling this
 920 * function as is the case with jbd2_cleanup_journal_tail()).
 921 *
 922 * Requires j_checkpoint_mutex
 923 */
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
	unsigned long freed;
	int ret;

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

	/*
	 * We cannot afford for write to remain in drive's caches since as
	 * soon as we update j_tail, next transaction can start reusing journal
	 * space and if we lose sb update during power failure we'd replay
	 * old transaction with possibly newly overwritten data.
	 */
	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
					      REQ_SYNC | REQ_FUA);
	if (ret)
		goto out;

	write_lock(&journal->j_state_lock);
	/* Account for wrap-around of the circular log when computing freed. */
	freed = block - journal->j_tail;
	if (block < journal->j_tail)
		freed += journal->j_last - journal->j_first;

	trace_jbd2_update_log_tail(journal, tid, block, freed);
	jbd_debug(1,
		  "Cleaning journal tail from %u to %u (offset %lu), "
		  "freeing %lu\n",
		  journal->j_tail_sequence, tid, block, freed);

	/* Only now, with the sb safely on disk, publish the new tail. */
	journal->j_free += freed;
	journal->j_tail_sequence = tid;
	journal->j_tail = block;
	write_unlock(&journal->j_state_lock);

out:
	return ret;
}
 961
 962/*
 963 * This is a variation of __jbd2_update_log_tail which checks for validity of
 964 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
 965 * with other threads updating log tail.
 966 */
 967void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 968{
 969        mutex_lock_io(&journal->j_checkpoint_mutex);
 970        if (tid_gt(tid, journal->j_tail_sequence))
 971                __jbd2_update_log_tail(journal, tid, block);
 972        mutex_unlock(&journal->j_checkpoint_mutex);
 973}
 974
/* Per-open private state for the /proc journal statistics file. */
struct jbd2_stats_proc_session {
	journal_t *journal;			/* journal the stats belong to */
	struct transaction_stats_s *stats;	/* snapshot taken at open time */
	int start;
	int max;
};
 981
 982static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
 983{
 984        return *pos ? NULL : SEQ_START_TOKEN;
 985}
 986
 987static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
 988{
 989        (*pos)++;
 990        return NULL;
 991}
 992
static int jbd2_seq_info_show(struct seq_file *seq, void *v)
{
	struct jbd2_stats_proc_session *s = seq->private;

	/* All output is produced for the single SEQ_START_TOKEN record. */
	if (v != SEQ_START_TOKEN)
		return 0;
	seq_printf(seq, "%lu transactions (%lu requested), "
		   "each up to %u blocks\n",
		   s->stats->ts_tid, s->stats->ts_requested,
		   s->journal->j_max_transaction_buffers);
	/* No transactions yet: bail out before dividing by ts_tid == 0. */
	if (s->stats->ts_tid == 0)
		return 0;
	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
	/* rs_request_delay is averaged per request, guarding against 0. */
	seq_printf(seq, "  %ums request delay\n",
	    (s->stats->ts_requested == 0) ? 0 :
	    jiffies_to_msecs(s->stats->run.rs_request_delay /
			     s->stats->ts_requested));
	seq_printf(seq, "  %ums running transaction\n",
	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
	seq_printf(seq, "  %ums transaction was being locked\n",
	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
	seq_printf(seq, "  %ums logging transaction\n",
	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
	seq_printf(seq, "  %lluus average transaction commit time\n",
		   div_u64(s->journal->j_average_commit_time, 1000));
	seq_printf(seq, "  %lu handles per transaction\n",
	    s->stats->run.rs_handle_count / s->stats->ts_tid);
	seq_printf(seq, "  %lu blocks per transaction\n",
	    s->stats->run.rs_blocks / s->stats->ts_tid);
	seq_printf(seq, "  %lu logged blocks per transaction\n",
	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
	return 0;
}
1029
/* Nothing to clean up per-iteration; session state is freed on release. */
static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
}
1033
/* Single-record seq_file: start emits one token, next always terminates. */
static const struct seq_operations jbd2_seq_info_ops = {
	.start  = jbd2_seq_info_start,
	.next   = jbd2_seq_info_next,
	.stop   = jbd2_seq_info_stop,
	.show   = jbd2_seq_info_show,
};
1040
1041static int jbd2_seq_info_open(struct inode *inode, struct file *file)
1042{
1043        journal_t *journal = PDE_DATA(inode);
1044        struct jbd2_stats_proc_session *s;
1045        int rc, size;
1046
1047        s = kmalloc(sizeof(*s), GFP_KERNEL);
1048        if (s == NULL)
1049                return -ENOMEM;
1050        size = sizeof(struct transaction_stats_s);
1051        s->stats = kmalloc(size, GFP_KERNEL);
1052        if (s->stats == NULL) {
1053                kfree(s);
1054                return -ENOMEM;
1055        }
1056        spin_lock(&journal->j_history_lock);
1057        memcpy(s->stats, &journal->j_stats, size);
1058        s->journal = journal;
1059        spin_unlock(&journal->j_history_lock);
1060
1061        rc = seq_open(file, &jbd2_seq_info_ops);
1062        if (rc == 0) {
1063                struct seq_file *m = file->private_data;
1064                m->private = s;
1065        } else {
1066                kfree(s->stats);
1067                kfree(s);
1068        }
1069        return rc;
1070
1071}
1072
1073static int jbd2_seq_info_release(struct inode *inode, struct file *file)
1074{
1075        struct seq_file *seq = file->private_data;
1076        struct jbd2_stats_proc_session *s = seq->private;
1077        kfree(s->stats);
1078        kfree(s);
1079        return seq_release(inode, file);
1080}
1081
/* File operations for the per-journal procfs "info" file. */
static const struct file_operations jbd2_seq_info_fops = {
	.owner		= THIS_MODULE,
	.open		= jbd2_seq_info_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= jbd2_seq_info_release,
};
1089
/* Parent procfs directory; per-journal stats dirs are created beneath it. */
static struct proc_dir_entry *proc_jbd2_stats;
1091
1092static void jbd2_stats_proc_init(journal_t *journal)
1093{
1094        journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
1095        if (journal->j_proc_entry) {
1096                proc_create_data("info", S_IRUGO, journal->j_proc_entry,
1097                                 &jbd2_seq_info_fops, journal);
1098        }
1099}
1100
/* Tear down the procfs entries created by jbd2_stats_proc_init(). */
static void jbd2_stats_proc_exit(journal_t *journal)
{
	/* Remove the "info" file first, then its parent directory. */
	remove_proc_entry("info", journal->j_proc_entry);
	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
1106
1107/* Minimum size of descriptor tag */
static int jbd2_min_tag_size(void)
{
	/*
	 * Tag with 32-bit block numbers does not use last four bytes of the
	 * structure, so the smallest on-disk tag is four bytes shorter.
	 */
	return sizeof(journal_block_tag_t) - 4;
}
1116
1117/*
1118 * Management for journal control blocks: functions to create and
1119 * destroy journal_t structures, and to initialise and read existing
1120 * journal blocks from disk.  */
1121
1122/* First: create and setup a journal_t object in memory.  We initialise
1123 * very few fields yet: that has to wait until we have created the
 * journal structures from scratch, or loaded them from disk. */
1125
static journal_t *journal_init_common(struct block_device *bdev,
			struct block_device *fs_dev,
			unsigned long long start, int len, int blocksize)
{
	static struct lock_class_key jbd2_trans_commit_key;
	journal_t *journal;
	int err;
	struct buffer_head *bh;
	int n;

	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
	if (!journal)
		return NULL;

	/* Initialise all wait queues and locks before anything can use them. */
	init_waitqueue_head(&journal->j_wait_transaction_locked);
	init_waitqueue_head(&journal->j_wait_done_commit);
	init_waitqueue_head(&journal->j_wait_commit);
	init_waitqueue_head(&journal->j_wait_updates);
	init_waitqueue_head(&journal->j_wait_reserved);
	mutex_init(&journal->j_abort_mutex);
	mutex_init(&journal->j_barrier);
	mutex_init(&journal->j_checkpoint_mutex);
	spin_lock_init(&journal->j_revoke_lock);
	spin_lock_init(&journal->j_list_lock);
	rwlock_init(&journal->j_state_lock);

	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
	journal->j_min_batch_time = 0;
	journal->j_max_batch_time = 15000; /* 15ms */
	atomic_set(&journal->j_reserved_credits, 0);

	/* The journal is marked for error until we succeed with recovery! */
	journal->j_flags = JBD2_ABORT;

	/* Set up a default-sized revoke table for the new mount. */
	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
	if (err)
		goto err_cleanup;

	spin_lock_init(&journal->j_history_lock);

	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
			 &jbd2_trans_commit_key, 0);

	/* journal descriptor can store up to n blocks -bzzz */
	journal->j_blocksize = blocksize;
	journal->j_dev = bdev;
	journal->j_fs_dev = fs_dev;
	journal->j_blk_offset = start;
	journal->j_maxlen = len;
	/* We need enough buffers to write out full descriptor block. */
	n = journal->j_blocksize / jbd2_min_tag_size();
	journal->j_wbufsize = n;
	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
					GFP_KERNEL);
	if (!journal->j_wbuf)
		goto err_cleanup;

	/* Pin the superblock in an unmovable buffer for the journal's life. */
	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
	if (!bh) {
		pr_err("%s: Cannot get buffer for journal superblock\n",
			__func__);
		goto err_cleanup;
	}
	journal->j_sb_buffer = bh;
	journal->j_superblock = (journal_superblock_t *)bh->b_data;

	return journal;

err_cleanup:
	/* NOTE(review): assumes the teardown helpers tolerate a partly
	 * initialized journal (kfree(NULL) is a no-op) — confirm for
	 * jbd2_journal_destroy_revoke(). */
	kfree(journal->j_wbuf);
	jbd2_journal_destroy_revoke(journal);
	kfree(journal);
	return NULL;
}
1201
1202/* jbd2_journal_init_dev and jbd2_journal_init_inode:
1203 *
1204 * Create a journal structure assigned some fixed set of disk blocks to
1205 * the journal.  We don't actually touch those disk blocks yet, but we
1206 * need to set up all of the mapping information to tell the journaling
1207 * system where the journal blocks are.
1208 *
1209 */
1210
1211/**
1212 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
1213 *  @bdev: Block device on which to create the journal
1214 *  @fs_dev: Device which hold journalled filesystem for this journal.
1215 *  @start: Block nr Start of journal.
1216 *  @len:  Length of the journal in blocks.
1217 *  @blocksize: blocksize of journalling device
1218 *
1219 *  Returns: a newly created journal_t *
1220 *
1221 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
1222 *  range of blocks on an arbitrary block device.
1223 *
1224 */
journal_t *jbd2_journal_init_dev(struct block_device *bdev,
			struct block_device *fs_dev,
			unsigned long long start, int len, int blocksize)
{
	journal_t *journal;

	journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
	if (!journal)
		return NULL;

	/* Use the device name, with '/' mangled to '!', for procfs entries. */
	bdevname(journal->j_dev, journal->j_devname);
	strreplace(journal->j_devname, '/', '!');
	jbd2_stats_proc_init(journal);

	return journal;
}
1241
1242/**
1243 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
1244 *  @inode: An inode to create the journal in
1245 *
1246 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
1247 * the journal.  The inode must exist already, must support bmap() and
1248 * must have all data blocks preallocated.
1249 */
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
	journal_t *journal;
	char *p;
	unsigned long long blocknr;

	/* The journal superblock lives in the inode's first block. */
	blocknr = bmap(inode, 0);
	if (!blocknr) {
		pr_err("%s: Cannot locate journal superblock\n",
			__func__);
		return NULL;
	}

	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);

	journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
			blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
			inode->i_sb->s_blocksize);
	if (!journal)
		return NULL;

	journal->j_inode = inode;
	/* Name the journal "<bdev>-<ino>", with '/' mangled to '!'. */
	bdevname(journal->j_dev, journal->j_devname);
	p = strreplace(journal->j_devname, '/', '!');
	sprintf(p, "-%lu", journal->j_inode->i_ino);
	jbd2_stats_proc_init(journal);

	return journal;
}
1281
1282/*
1283 * If the journal init or create aborts, we need to mark the journal
1284 * superblock as being NULL to prevent the journal destroy from writing
1285 * back a bogus superblock.
1286 */
1287static void journal_fail_superblock (journal_t *journal)
1288{
1289        struct buffer_head *bh = journal->j_sb_buffer;
1290        brelse(bh);
1291        journal->j_sb_buffer = NULL;
1292}
1293
1294/*
1295 * Given a journal_t structure, initialise the various fields for
1296 * startup of a new journaling session.  We use this both when creating
1297 * a journal, and after recovering an old journal to reset it for
1298 * subsequent use.
1299 */
1300
static int journal_reset(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	unsigned long long first, last;

	first = be32_to_cpu(sb->s_first);
	last = be32_to_cpu(sb->s_maxlen);
	/* The usable area must hold at least JBD2_MIN_JOURNAL_BLOCKS. */
	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
		printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
		       first, last);
		journal_fail_superblock(journal);
		return -EINVAL;
	}

	journal->j_first = first;
	journal->j_last = last;

	/* Start writing from the beginning: the whole log is free. */
	journal->j_head = first;
	journal->j_tail = first;
	journal->j_free = last - first;

	journal->j_tail_sequence = journal->j_transaction_sequence;
	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
	journal->j_commit_request = journal->j_commit_sequence;

	journal->j_max_transaction_buffers = journal->j_maxlen / 4;

	/*
	 * As a special case, if the on-disk copy is already marked as needing
	 * no recovery (s_start == 0), then we can safely defer the superblock
	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
	 * attempting a write to a potential-readonly device.
	 */
	if (sb->s_start == 0) {
		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
			"(start %ld, seq %u, errno %d)\n",
			journal->j_tail, journal->j_tail_sequence,
			journal->j_errno);
		journal->j_flags |= JBD2_FLUSHED;
	} else {
		/* Lock here to make assertions happy... */
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * Update log tail information. We use REQ_FUA since new
		 * transaction will start reusing journal space and so we
		 * must make sure information about current log tail is on
		 * disk before that.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC | REQ_FUA);
		mutex_unlock(&journal->j_checkpoint_mutex);
	}
	return jbd2_journal_start_thread(journal);
}
1357
1358/*
1359 * This function expects that the caller will have locked the journal
1360 * buffer head, and will return with it unlocked
1361 */
static int jbd2_write_superblock(journal_t *journal, int write_flags)
{
	struct buffer_head *bh = journal->j_sb_buffer;
	journal_superblock_t *sb = journal->j_superblock;
	int ret;

	/* Buffer got discarded which means block device got invalidated */
	if (!buffer_mapped(bh)) {
		unlock_buffer(bh);
		return -EIO;
	}

	trace_jbd2_write_superblock(journal, write_flags);
	/* Without barrier support, FUA/flush semantics cannot be honoured. */
	if (!(journal->j_flags & JBD2_BARRIER))
		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
	if (buffer_write_io_error(bh)) {
		/*
		 * Oh, dear.  A previous attempt to write the journal
		 * superblock failed.  This could happen because the
		 * USB device was yanked out.  Or it could happen to
		 * be a transient write error and maybe the block will
		 * be remapped.  Nothing we can do but to retry the
		 * write and hope for the best.
		 */
		printk(KERN_ERR "JBD2: previous I/O error detected "
		       "for journal superblock update for %s.\n",
		       journal->j_devname);
		clear_buffer_write_io_error(bh);
		set_buffer_uptodate(bh);
	}
	/* Refresh the checksum over the superblock contents just written. */
	if (jbd2_journal_has_csum_v2or3(journal))
		sb->s_checksum = jbd2_superblock_csum(journal, sb);
	/* Take an extra reference on the buffer across the I/O. */
	get_bh(bh);
	bh->b_end_io = end_buffer_write_sync;
	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
	wait_on_buffer(bh);
	if (buffer_write_io_error(bh)) {
		clear_buffer_write_io_error(bh);
		set_buffer_uptodate(bh);
		ret = -EIO;
	}
	/* Failing to persist the superblock is fatal: abort the journal. */
	if (ret) {
		printk(KERN_ERR "JBD2: Error %d detected when updating "
		       "journal superblock for %s.\n", ret,
		       journal->j_devname);
		if (!is_journal_aborted(journal))
			jbd2_journal_abort(journal, ret);
	}

	return ret;
}
1413
1414/**
1415 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1416 * @journal: The journal to update.
1417 * @tail_tid: TID of the new transaction at the tail of the log
1418 * @tail_block: The first block of the transaction at the tail of the log
1419 * @write_op: With which operation should we write the journal sb
1420 *
1421 * Update a journal's superblock information about log tail and write it to
1422 * disk, waiting for the IO to complete.
1423 */
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
				     unsigned long tail_block, int write_op)
{
	journal_superblock_t *sb = journal->j_superblock;
	int ret;

	if (is_journal_aborted(journal))
		return -EIO;

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
		  tail_block, tail_tid);

	/* jbd2_write_superblock() expects the buffer locked on entry. */
	lock_buffer(journal->j_sb_buffer);
	sb->s_sequence = cpu_to_be32(tail_tid);
	sb->s_start    = cpu_to_be32(tail_block);

	ret = jbd2_write_superblock(journal, write_op);
	if (ret)
		goto out;

	/* Log is no longer empty */
	write_lock(&journal->j_state_lock);
	WARN_ON(!sb->s_sequence);
	journal->j_flags &= ~JBD2_FLUSHED;
	write_unlock(&journal->j_state_lock);

out:
	return ret;
}
1454
1455/**
1456 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1457 * @journal: The journal to update.
1458 * @write_op: With which operation should we write the journal sb
1459 *
1460 * Update a journal's dynamic superblock fields to show that journal is empty.
1461 * Write updated superblock to disk waiting for IO to complete.
1462 */
static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
	journal_superblock_t *sb = journal->j_superblock;

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
	lock_buffer(journal->j_sb_buffer);
	if (sb->s_start == 0) {		/* Is it already empty? */
		unlock_buffer(journal->j_sb_buffer);
		return;
	}

	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
		  journal->j_tail_sequence);

	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
	/* s_start == 0 is the on-disk marker for "no recovery needed". */
	sb->s_start    = cpu_to_be32(0);

	jbd2_write_superblock(journal, write_op);

	/* Log is now empty */
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FLUSHED;
	write_unlock(&journal->j_state_lock);
}
1487
1488
1489/**
1490 * jbd2_journal_update_sb_errno() - Update error in the journal.
1491 * @journal: The journal to update.
1492 *
1493 * Update a journal's errno.  Write updated superblock to disk waiting for IO
1494 * to complete.
1495 */
void jbd2_journal_update_sb_errno(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	int errcode;

	lock_buffer(journal->j_sb_buffer);
	errcode = journal->j_errno;
	/* An -ESHUTDOWN abort is deliberate; don't persist it as an error. */
	if (errcode == -ESHUTDOWN)
		errcode = 0;
	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
	sb->s_errno    = cpu_to_be32(errcode);

	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
1511
1512static int journal_revoke_records_per_block(journal_t *journal)
1513{
1514        int record_size;
1515        int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
1516
1517        if (jbd2_has_feature_64bit(journal))
1518                record_size = 8;
1519        else
1520                record_size = 4;
1521
1522        if (jbd2_journal_has_csum_v2or3(journal))
1523                space -= sizeof(struct jbd2_journal_block_tail);
1524        return space / record_size;
1525}
1526
1527/*
1528 * Read the superblock for a given journal, performing initial
1529 * validation of the format.
1530 */
static int journal_get_superblock(journal_t *journal)
{
	struct buffer_head *bh;
	journal_superblock_t *sb;
	int err = -EIO;

	bh = journal->j_sb_buffer;

	J_ASSERT(bh != NULL);
	/* Read the superblock from disk if it isn't cached yet. */
	if (!buffer_uptodate(bh)) {
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			printk(KERN_ERR
				"JBD2: IO error reading journal superblock\n");
			goto out;
		}
	}

	/* Already validated on a previous call: nothing more to do. */
	if (buffer_verified(bh))
		return 0;

	sb = journal->j_superblock;

	err = -EINVAL;

	/* Magic and blocksize must match before any other field is trusted. */
	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
		goto out;
	}

	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
	case JBD2_SUPERBLOCK_V1:
		journal->j_format_version = 1;
		break;
	case JBD2_SUPERBLOCK_V2:
		journal->j_format_version = 2;
		break;
	default:
		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
		goto out;
	}

	/* The on-disk length may shrink our view of the journal, never grow it. */
	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
		printk(KERN_WARNING "JBD2: journal file too short\n");
		goto out;
	}

	/* s_first must point inside the journal and past block 0 (the sb). */
	if (be32_to_cpu(sb->s_first) == 0 ||
	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
		printk(KERN_WARNING
			"JBD2: Invalid start block of journal: %u\n",
			be32_to_cpu(sb->s_first));
		goto out;
	}

	if (jbd2_has_feature_csum2(journal) &&
	    jbd2_has_feature_csum3(journal)) {
		/* Can't have checksum v2 and v3 at the same time! */
		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
		       "at the same time!\n");
		goto out;
	}

	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
	    jbd2_has_feature_checksum(journal)) {
		/* Can't have checksum v1 and v2 on at the same time! */
		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
		       "at the same time!\n");
		goto out;
	}

	if (!jbd2_verify_csum_type(journal, sb)) {
		printk(KERN_ERR "JBD2: Unknown checksum type\n");
		goto out;
	}

	/* Load the checksum driver */
	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
		if (IS_ERR(journal->j_chksum_driver)) {
			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
			err = PTR_ERR(journal->j_chksum_driver);
			journal->j_chksum_driver = NULL;
			goto out;
		}
	}

	if (jbd2_journal_has_csum_v2or3(journal)) {
		/* Check superblock checksum */
		if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
			printk(KERN_ERR "JBD2: journal checksum error\n");
			err = -EFSBADCRC;
			goto out;
		}

		/* Precompute checksum seed for all metadata */
		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
						   sizeof(sb->s_uuid));
	}

	journal->j_revoke_records_per_block =
				journal_revoke_records_per_block(journal);
	/* Remember that this buffer has passed validation. */
	set_buffer_verified(bh);

	return 0;

out:
	journal_fail_superblock(journal);
	return err;
}
1645
1646/*
1647 * Load the on-disk journal superblock and read the key fields into the
1648 * journal_t.
1649 */
1650
1651static int load_superblock(journal_t *journal)
1652{
1653        int err;
1654        journal_superblock_t *sb;
1655
1656        err = journal_get_superblock(journal);
1657        if (err)
1658                return err;
1659
1660        sb = journal->j_superblock;
1661
1662        journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1663        journal->j_tail = be32_to_cpu(sb->s_start);
1664        journal->j_first = be32_to_cpu(sb->s_first);
1665        journal->j_last = be32_to_cpu(sb->s_maxlen);
1666        journal->j_errno = be32_to_cpu(sb->s_errno);
1667
1668        return 0;
1669}
1670
1671
1672/**
1673 * int jbd2_journal_load() - Read journal from disk.
1674 * @journal: Journal to act on.
1675 *
1676 * Given a journal_t structure which tells us which disk blocks contain
1677 * a journal, read the journal from disk to initialise the in-memory
1678 * structures.
1679 */
1680int jbd2_journal_load(journal_t *journal)
1681{
1682        int err;
1683        journal_superblock_t *sb;
1684
1685        err = load_superblock(journal);
1686        if (err)
1687                return err;
1688
1689        sb = journal->j_superblock;
1690        /* If this is a V2 superblock, then we have to check the
1691         * features flags on it. */
1692
1693        if (journal->j_format_version >= 2) {
1694                if ((sb->s_feature_ro_compat &
1695                     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1696                    (sb->s_feature_incompat &
1697                     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1698                        printk(KERN_WARNING
1699                                "JBD2: Unrecognised features on journal\n");
1700                        return -EINVAL;
1701                }
1702        }
1703
1704        /*
1705         * Create a slab for this blocksize
1706         */
1707        err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1708        if (err)
1709                return err;
1710
1711        /* Let the recovery code check whether it needs to recover any
1712         * data from the journal. */
1713        if (jbd2_journal_recover(journal))
1714                goto recovery_error;
1715
1716        if (journal->j_failed_commit) {
1717                printk(KERN_ERR "JBD2: journal transaction %u on %s "
1718                       "is corrupt.\n", journal->j_failed_commit,
1719                       journal->j_devname);
1720                return -EFSCORRUPTED;
1721        }
1722        /*
1723         * clear JBD2_ABORT flag initialized in journal_init_common
1724         * here to update log tail information with the newest seq.
1725         */
1726        journal->j_flags &= ~JBD2_ABORT;
1727
1728        /* OK, we've finished with the dynamic journal bits:
1729         * reinitialise the dynamic contents of the superblock in memory
1730         * and reset them on disk. */
1731        if (journal_reset(journal))
1732                goto recovery_error;
1733
1734        journal->j_flags |= JBD2_LOADED;
1735        return 0;
1736
1737recovery_error:
1738        printk(KERN_WARNING "JBD2: recovery failed\n");
1739        return -EIO;
1740}
1741
1742/**
1743 * void jbd2_journal_destroy() - Release a journal_t structure.
1744 * @journal: Journal to act on.
1745 *
1746 * Release a journal_t structure once it is no longer in use by the
1747 * journaled object.
1748 * Return <0 if we couldn't clean up the journal.
1749 */
1750int jbd2_journal_destroy(journal_t *journal)
1751{
1752        int err = 0;
1753
1754        /* Wait for the commit thread to wake up and die. */
1755        journal_kill_thread(journal);
1756
1757        /* Force a final log commit */
1758        if (journal->j_running_transaction)
1759                jbd2_journal_commit_transaction(journal);
1760
1761        /* Force any old transactions to disk */
1762
1763        /* Totally anal locking here... */
1764        spin_lock(&journal->j_list_lock);
1765        while (journal->j_checkpoint_transactions != NULL) {
1766                spin_unlock(&journal->j_list_lock);
1767                mutex_lock_io(&journal->j_checkpoint_mutex);
1768                err = jbd2_log_do_checkpoint(journal);
1769                mutex_unlock(&journal->j_checkpoint_mutex);
1770                /*
1771                 * If checkpointing failed, just free the buffers to avoid
1772                 * looping forever
1773                 */
1774                if (err) {
1775                        jbd2_journal_destroy_checkpoint(journal);
1776                        spin_lock(&journal->j_list_lock);
1777                        break;
1778                }
1779                spin_lock(&journal->j_list_lock);
1780        }
1781
1782        J_ASSERT(journal->j_running_transaction == NULL);
1783        J_ASSERT(journal->j_committing_transaction == NULL);
1784        J_ASSERT(journal->j_checkpoint_transactions == NULL);
1785        spin_unlock(&journal->j_list_lock);
1786
1787        if (journal->j_sb_buffer) {
1788                if (!is_journal_aborted(journal)) {
1789                        mutex_lock_io(&journal->j_checkpoint_mutex);
1790
1791                        write_lock(&journal->j_state_lock);
1792                        journal->j_tail_sequence =
1793                                ++journal->j_transaction_sequence;
1794                        write_unlock(&journal->j_state_lock);
1795
1796                        jbd2_mark_journal_empty(journal,
1797                                        REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
1798                        mutex_unlock(&journal->j_checkpoint_mutex);
1799                } else
1800                        err = -EIO;
1801                brelse(journal->j_sb_buffer);
1802        }
1803
1804        if (journal->j_proc_entry)
1805                jbd2_stats_proc_exit(journal);
1806        iput(journal->j_inode);
1807        if (journal->j_revoke)
1808                jbd2_journal_destroy_revoke(journal);
1809        if (journal->j_chksum_driver)
1810                crypto_free_shash(journal->j_chksum_driver);
1811        kfree(journal->j_wbuf);
1812        kfree(journal);
1813
1814        return err;
1815}
1816
1817
1818/**
1819 *int jbd2_journal_check_used_features () - Check if features specified are used.
1820 * @journal: Journal to check.
1821 * @compat: bitmask of compatible features
1822 * @ro: bitmask of features that force read-only mount
1823 * @incompat: bitmask of incompatible features
1824 *
1825 * Check whether the journal uses all of a given set of
1826 * features.  Return true (non-zero) if it does.
1827 **/
1828
1829int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1830                                 unsigned long ro, unsigned long incompat)
1831{
1832        journal_superblock_t *sb;
1833
1834        if (!compat && !ro && !incompat)
1835                return 1;
1836        /* Load journal superblock if it is not loaded yet. */
1837        if (journal->j_format_version == 0 &&
1838            journal_get_superblock(journal) != 0)
1839                return 0;
1840        if (journal->j_format_version == 1)
1841                return 0;
1842
1843        sb = journal->j_superblock;
1844
1845        if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1846            ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1847            ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1848                return 1;
1849
1850        return 0;
1851}
1852
1853/**
1854 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1855 * @journal: Journal to check.
1856 * @compat: bitmask of compatible features
1857 * @ro: bitmask of features that force read-only mount
1858 * @incompat: bitmask of incompatible features
1859 *
1860 * Check whether the journaling code supports the use of
1861 * all of a given set of features on this journal.  Return true
1862 * (non-zero) if it can. */
1863
1864int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1865                                      unsigned long ro, unsigned long incompat)
1866{
1867        if (!compat && !ro && !incompat)
1868                return 1;
1869
1870        /* We can support any known requested features iff the
1871         * superblock is in version 2.  Otherwise we fail to support any
1872         * extended sb features. */
1873
1874        if (journal->j_format_version != 2)
1875                return 0;
1876
1877        if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1878            (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1879            (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1880                return 1;
1881
1882        return 0;
1883}
1884
1885/**
1886 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1887 * @journal: Journal to act on.
1888 * @compat: bitmask of compatible features
1889 * @ro: bitmask of features that force read-only mount
1890 * @incompat: bitmask of incompatible features
1891 *
1892 * Mark a given journal feature as present on the
1893 * superblock.  Returns true if the requested features could be set.
1894 *
1895 */
1896
1897int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1898                          unsigned long ro, unsigned long incompat)
1899{
1900#define INCOMPAT_FEATURE_ON(f) \
1901                ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
1902#define COMPAT_FEATURE_ON(f) \
1903                ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
1904        journal_superblock_t *sb;
1905
1906        if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1907                return 1;
1908
1909        if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1910                return 0;
1911
1912        /* If enabling v2 checksums, turn on v3 instead */
1913        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1914                incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1915                incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1916        }
1917
1918        /* Asking for checksumming v3 and v1?  Only give them v3. */
1919        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1920            compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1921                compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1922
1923        jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1924                  compat, ro, incompat);
1925
1926        sb = journal->j_superblock;
1927
1928        /* Load the checksum driver if necessary */
1929        if ((journal->j_chksum_driver == NULL) &&
1930            INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1931                journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1932                if (IS_ERR(journal->j_chksum_driver)) {
1933                        printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1934                        journal->j_chksum_driver = NULL;
1935                        return 0;
1936                }
1937                /* Precompute checksum seed for all metadata */
1938                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1939                                                   sizeof(sb->s_uuid));
1940        }
1941
1942        lock_buffer(journal->j_sb_buffer);
1943
1944        /* If enabling v3 checksums, update superblock */
1945        if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1946                sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1947                sb->s_feature_compat &=
1948                        ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
1949        }
1950
1951        /* If enabling v1 checksums, downgrade superblock */
1952        if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1953                sb->s_feature_incompat &=
1954                        ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1955                                     JBD2_FEATURE_INCOMPAT_CSUM_V3);
1956
1957        sb->s_feature_compat    |= cpu_to_be32(compat);
1958        sb->s_feature_ro_compat |= cpu_to_be32(ro);
1959        sb->s_feature_incompat  |= cpu_to_be32(incompat);
1960        unlock_buffer(journal->j_sb_buffer);
1961        journal->j_revoke_records_per_block =
1962                                journal_revoke_records_per_block(journal);
1963
1964        return 1;
1965#undef COMPAT_FEATURE_ON
1966#undef INCOMPAT_FEATURE_ON
1967}
1968
1969/*
1970 * jbd2_journal_clear_features () - Clear a given journal feature in the
1971 *                                  superblock
1972 * @journal: Journal to act on.
1973 * @compat: bitmask of compatible features
1974 * @ro: bitmask of features that force read-only mount
1975 * @incompat: bitmask of incompatible features
1976 *
1977 * Clear a given journal feature as present on the
1978 * superblock.
1979 */
1980void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1981                                unsigned long ro, unsigned long incompat)
1982{
1983        journal_superblock_t *sb;
1984
1985        jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1986                  compat, ro, incompat);
1987
1988        sb = journal->j_superblock;
1989
1990        sb->s_feature_compat    &= ~cpu_to_be32(compat);
1991        sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1992        sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
1993        journal->j_revoke_records_per_block =
1994                                journal_revoke_records_per_block(journal);
1995}
1996EXPORT_SYMBOL(jbd2_journal_clear_features);
1997
1998/**
1999 * int jbd2_journal_flush () - Flush journal
2000 * @journal: Journal to act on.
2001 *
2002 * Flush all data for a given journal to disk and empty the journal.
2003 * Filesystems can use this when remounting readonly to ensure that
2004 * recovery does not need to happen on remount.
2005 */
2006
2007int jbd2_journal_flush(journal_t *journal)
2008{
2009        int err = 0;
2010        transaction_t *transaction = NULL;
2011
2012        write_lock(&journal->j_state_lock);
2013
2014        /* Force everything buffered to the log... */
2015        if (journal->j_running_transaction) {
2016                transaction = journal->j_running_transaction;
2017                __jbd2_log_start_commit(journal, transaction->t_tid);
2018        } else if (journal->j_committing_transaction)
2019                transaction = journal->j_committing_transaction;
2020
2021        /* Wait for the log commit to complete... */
2022        if (transaction) {
2023                tid_t tid = transaction->t_tid;
2024
2025                write_unlock(&journal->j_state_lock);
2026                jbd2_log_wait_commit(journal, tid);
2027        } else {
2028                write_unlock(&journal->j_state_lock);
2029        }
2030
2031        /* ...and flush everything in the log out to disk. */
2032        spin_lock(&journal->j_list_lock);
2033        while (!err && journal->j_checkpoint_transactions != NULL) {
2034                spin_unlock(&journal->j_list_lock);
2035                mutex_lock_io(&journal->j_checkpoint_mutex);
2036                err = jbd2_log_do_checkpoint(journal);
2037                mutex_unlock(&journal->j_checkpoint_mutex);
2038                spin_lock(&journal->j_list_lock);
2039        }
2040        spin_unlock(&journal->j_list_lock);
2041
2042        if (is_journal_aborted(journal))
2043                return -EIO;
2044
2045        mutex_lock_io(&journal->j_checkpoint_mutex);
2046        if (!err) {
2047                err = jbd2_cleanup_journal_tail(journal);
2048                if (err < 0) {
2049                        mutex_unlock(&journal->j_checkpoint_mutex);
2050                        goto out;
2051                }
2052                err = 0;
2053        }
2054
2055        /* Finally, mark the journal as really needing no recovery.
2056         * This sets s_start==0 in the underlying superblock, which is
2057         * the magic code for a fully-recovered superblock.  Any future
2058         * commits of data to the journal will restore the current
2059         * s_start value. */
2060        jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
2061        mutex_unlock(&journal->j_checkpoint_mutex);
2062        write_lock(&journal->j_state_lock);
2063        J_ASSERT(!journal->j_running_transaction);
2064        J_ASSERT(!journal->j_committing_transaction);
2065        J_ASSERT(!journal->j_checkpoint_transactions);
2066        J_ASSERT(journal->j_head == journal->j_tail);
2067        J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
2068        write_unlock(&journal->j_state_lock);
2069out:
2070        return err;
2071}
2072
2073/**
2074 * int jbd2_journal_wipe() - Wipe journal contents
2075 * @journal: Journal to act on.
2076 * @write: flag (see below)
2077 *
2078 * Wipe out all of the contents of a journal, safely.  This will produce
2079 * a warning if the journal contains any valid recovery information.
2080 * Must be called between journal_init_*() and jbd2_journal_load().
2081 *
2082 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
2083 * we merely suppress recovery.
2084 */
2085
2086int jbd2_journal_wipe(journal_t *journal, int write)
2087{
2088        int err = 0;
2089
2090        J_ASSERT (!(journal->j_flags & JBD2_LOADED));
2091
2092        err = load_superblock(journal);
2093        if (err)
2094                return err;
2095
2096        if (!journal->j_tail)
2097                goto no_recovery;
2098
2099        printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
2100                write ? "Clearing" : "Ignoring");
2101
2102        err = jbd2_journal_skip_recovery(journal);
2103        if (write) {
2104                /* Lock to make assertions happy... */
2105                mutex_lock_io(&journal->j_checkpoint_mutex);
2106                jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
2107                mutex_unlock(&journal->j_checkpoint_mutex);
2108        }
2109
2110 no_recovery:
2111        return err;
2112}
2113
2114/**
2115 * void jbd2_journal_abort () - Shutdown the journal immediately.
2116 * @journal: the journal to shutdown.
2117 * @errno:   an error number to record in the journal indicating
2118 *           the reason for the shutdown.
2119 *
2120 * Perform a complete, immediate shutdown of the ENTIRE
2121 * journal (not of a single transaction).  This operation cannot be
2122 * undone without closing and reopening the journal.
2123 *
2124 * The jbd2_journal_abort function is intended to support higher level error
2125 * recovery mechanisms such as the ext2/ext3 remount-readonly error
2126 * mode.
2127 *
2128 * Journal abort has very specific semantics.  Any existing dirty,
2129 * unjournaled buffers in the main filesystem will still be written to
2130 * disk by bdflush, but the journaling mechanism will be suspended
2131 * immediately and no further transaction commits will be honoured.
2132 *
2133 * Any dirty, journaled buffers will be written back to disk without
2134 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
2135 * filesystem, but we _do_ attempt to leave as much data as possible
2136 * behind for fsck to use for cleanup.
2137 *
2138 * Any attempt to get a new transaction handle on a journal which is in
2139 * ABORT state will just result in an -EROFS error return.  A
2140 * jbd2_journal_stop on an existing handle will return -EIO if we have
2141 * entered abort state during the update.
2142 *
2143 * Recursive transactions are not disturbed by journal abort until the
2144 * final jbd2_journal_stop, which will receive the -EIO error.
2145 *
2146 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
2147 * which will be recorded (if possible) in the journal superblock.  This
2148 * allows a client to record failure conditions in the middle of a
2149 * transaction without having to complete the transaction to record the
2150 * failure to disk.  ext3_error, for example, now uses this
2151 * functionality.
2152 *
2153 */
2154
2155void jbd2_journal_abort(journal_t *journal, int errno)
2156{
2157        transaction_t *transaction;
2158
2159        /*
2160         * Lock the aborting procedure until everything is done, this avoid
2161         * races between filesystem's error handling flow (e.g. ext4_abort()),
2162         * ensure panic after the error info is written into journal's
2163         * superblock.
2164         */
2165        mutex_lock(&journal->j_abort_mutex);
2166        /*
2167         * ESHUTDOWN always takes precedence because a file system check
2168         * caused by any other journal abort error is not required after
2169         * a shutdown triggered.
2170         */
2171        write_lock(&journal->j_state_lock);
2172        if (journal->j_flags & JBD2_ABORT) {
2173                int old_errno = journal->j_errno;
2174
2175                write_unlock(&journal->j_state_lock);
2176                if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
2177                        journal->j_errno = errno;
2178                        jbd2_journal_update_sb_errno(journal);
2179                }
2180                mutex_unlock(&journal->j_abort_mutex);
2181                return;
2182        }
2183
2184        /*
2185         * Mark the abort as occurred and start current running transaction
2186         * to release all journaled buffer.
2187         */
2188        pr_err("Aborting journal on device %s.\n", journal->j_devname);
2189
2190        journal->j_flags |= JBD2_ABORT;
2191        journal->j_errno = errno;
2192        transaction = journal->j_running_transaction;
2193        if (transaction)
2194                __jbd2_log_start_commit(journal, transaction->t_tid);
2195        write_unlock(&journal->j_state_lock);
2196
2197        /*
2198         * Record errno to the journal super block, so that fsck and jbd2
2199         * layer could realise that a filesystem check is needed.
2200         */
2201        jbd2_journal_update_sb_errno(journal);
2202        mutex_unlock(&journal->j_abort_mutex);
2203}
2204
2205/**
2206 * int jbd2_journal_errno () - returns the journal's error state.
2207 * @journal: journal to examine.
2208 *
2209 * This is the errno number set with jbd2_journal_abort(), the last
2210 * time the journal was mounted - if the journal was stopped
2211 * without calling abort this will be 0.
2212 *
2213 * If the journal has been aborted on this mount time -EROFS will
2214 * be returned.
2215 */
2216int jbd2_journal_errno(journal_t *journal)
2217{
2218        int err;
2219
2220        read_lock(&journal->j_state_lock);
2221        if (journal->j_flags & JBD2_ABORT)
2222                err = -EROFS;
2223        else
2224                err = journal->j_errno;
2225        read_unlock(&journal->j_state_lock);
2226        return err;
2227}
2228
2229/**
2230 * int jbd2_journal_clear_err () - clears the journal's error state
2231 * @journal: journal to act on.
2232 *
2233 * An error must be cleared or acked to take a FS out of readonly
2234 * mode.
2235 */
2236int jbd2_journal_clear_err(journal_t *journal)
2237{
2238        int err = 0;
2239
2240        write_lock(&journal->j_state_lock);
2241        if (journal->j_flags & JBD2_ABORT)
2242                err = -EROFS;
2243        else
2244                journal->j_errno = 0;
2245        write_unlock(&journal->j_state_lock);
2246        return err;
2247}
2248
2249/**
2250 * void jbd2_journal_ack_err() - Ack journal err.
2251 * @journal: journal to act on.
2252 *
2253 * An error must be cleared or acked to take a FS out of readonly
2254 * mode.
2255 */
2256void jbd2_journal_ack_err(journal_t *journal)
2257{
2258        write_lock(&journal->j_state_lock);
2259        if (journal->j_errno)
2260                journal->j_flags |= JBD2_ACK_ERR;
2261        write_unlock(&journal->j_state_lock);
2262}
2263
2264int jbd2_journal_blocks_per_page(struct inode *inode)
2265{
2266        return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
2267}
2268
2269/*
2270 * helper functions to deal with 32 or 64bit block numbers.
2271 */
2272size_t journal_tag_bytes(journal_t *journal)
2273{
2274        size_t sz;
2275
2276        if (jbd2_has_feature_csum3(journal))
2277                return sizeof(journal_block_tag3_t);
2278
2279        sz = sizeof(journal_block_tag_t);
2280
2281        if (jbd2_has_feature_csum2(journal))
2282                sz += sizeof(__u16);
2283
2284        if (jbd2_has_feature_64bit(journal))
2285                return sz;
2286        else
2287                return sz - sizeof(__u32);
2288}
2289
2290/*
2291 * JBD memory management
2292 *
2293 * These functions are used to allocate block-sized chunks of memory
2294 * used for making copies of buffer_head data.  Very often it will be
2295 * page-sized chunks of data, but sometimes it will be in
2296 * sub-page-size chunks.  (For example, 16k pages on Power systems
2297 * with a 4k block file system.)  For blocks smaller than a page, we
2298 * use a SLAB allocator.  There are slab caches for each block size,
2299 * which are allocated at mount time, if necessary, and we only free
2300 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
2301 * this reason we don't need to a mutex to protect access to
2302 * jbd2_slab[] allocating or releasing memory; only in
2303 * jbd2_journal_create_slab().
2304 */
2305#define JBD2_MAX_SLABS 8
2306static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
2307
2308static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
2309        "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
2310        "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
2311};
2312
2313
2314static void jbd2_journal_destroy_slabs(void)
2315{
2316        int i;
2317
2318        for (i = 0; i < JBD2_MAX_SLABS; i++) {
2319                kmem_cache_destroy(jbd2_slab[i]);
2320                jbd2_slab[i] = NULL;
2321        }
2322}
2323
2324static int jbd2_journal_create_slab(size_t size)
2325{
2326        static DEFINE_MUTEX(jbd2_slab_create_mutex);
2327        int i = order_base_2(size) - 10;
2328        size_t slab_size;
2329
2330        if (size == PAGE_SIZE)
2331                return 0;
2332
2333        if (i >= JBD2_MAX_SLABS)
2334                return -EINVAL;
2335
2336        if (unlikely(i < 0))
2337                i = 0;
2338        mutex_lock(&jbd2_slab_create_mutex);
2339        if (jbd2_slab[i]) {
2340                mutex_unlock(&jbd2_slab_create_mutex);
2341                return 0;       /* Already created */
2342        }
2343
2344        slab_size = 1 << (i+10);
2345        jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
2346                                         slab_size, 0, NULL);
2347        mutex_unlock(&jbd2_slab_create_mutex);
2348        if (!jbd2_slab[i]) {
2349                printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
2350                return -ENOMEM;
2351        }
2352        return 0;
2353}
2354
2355static struct kmem_cache *get_slab(size_t size)
2356{
2357        int i = order_base_2(size) - 10;
2358
2359        BUG_ON(i >= JBD2_MAX_SLABS);
2360        if (unlikely(i < 0))
2361                i = 0;
2362        BUG_ON(jbd2_slab[i] == NULL);
2363        return jbd2_slab[i];
2364}
2365
2366void *jbd2_alloc(size_t size, gfp_t flags)
2367{
2368        void *ptr;
2369
2370        BUG_ON(size & (size-1)); /* Must be a power of 2 */
2371
2372        if (size < PAGE_SIZE)
2373                ptr = kmem_cache_alloc(get_slab(size), flags);
2374        else
2375                ptr = (void *)__get_free_pages(flags, get_order(size));
2376
2377        /* Check alignment; SLUB has gotten this wrong in the past,
2378         * and this can lead to user data corruption! */
2379        BUG_ON(((unsigned long) ptr) & (size-1));
2380
2381        return ptr;
2382}
2383
2384void jbd2_free(void *ptr, size_t size)
2385{
2386        if (size < PAGE_SIZE)
2387                kmem_cache_free(get_slab(size), ptr);
2388        else
2389                free_pages((unsigned long)ptr, get_order(size));
2390};
2391
2392/*
2393 * Journal_head storage management
2394 */
2395static struct kmem_cache *jbd2_journal_head_cache;
2396#ifdef CONFIG_JBD2_DEBUG
2397static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2398#endif
2399
2400static int __init jbd2_journal_init_journal_head_cache(void)
2401{
2402        J_ASSERT(!jbd2_journal_head_cache);
2403        jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
2404                                sizeof(struct journal_head),
2405                                0,              /* offset */
2406                                SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
2407                                NULL);          /* ctor */
2408        if (!jbd2_journal_head_cache) {
2409                printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
2410                return -ENOMEM;
2411        }
2412        return 0;
2413}
2414
2415static void jbd2_journal_destroy_journal_head_cache(void)
2416{
2417        kmem_cache_destroy(jbd2_journal_head_cache);
2418        jbd2_journal_head_cache = NULL;
2419}
2420
2421/*
2422 * journal_head splicing and dicing
2423 */
2424static struct journal_head *journal_alloc_journal_head(void)
2425{
2426        struct journal_head *ret;
2427
2428#ifdef CONFIG_JBD2_DEBUG
2429        atomic_inc(&nr_journal_heads);
2430#endif
2431        ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2432        if (!ret) {
2433                jbd_debug(1, "out of memory for journal_head\n");
2434                pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2435                ret = kmem_cache_zalloc(jbd2_journal_head_cache,
2436                                GFP_NOFS | __GFP_NOFAIL);
2437        }
2438        return ret;
2439}
2440
2441static void journal_free_journal_head(struct journal_head *jh)
2442{
2443#ifdef CONFIG_JBD2_DEBUG
2444        atomic_dec(&nr_journal_heads);
2445        memset(jh, JBD2_POISON_FREE, sizeof(*jh));
2446#endif
2447        kmem_cache_free(jbd2_journal_head_cache, jh);
2448}
2449
2450/*
2451 * A journal_head is attached to a buffer_head whenever JBD has an
2452 * interest in the buffer.
2453 *
2454 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
2455 * is set.  This bit is tested in core kernel code where we need to take
2456 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
2457 * there.
2458 *
2459 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
2460 *
2461 * When a buffer has its BH_JBD bit set it is immune from being released by
2462 * core kernel code, mainly via ->b_count.
2463 *
2464 * A journal_head is detached from its buffer_head when the journal_head's
2465 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
2466 * transaction (b_cp_transaction) hold their references to b_jcount.
2467 *
2468 * Various places in the kernel want to attach a journal_head to a buffer_head
2469 * _before_ attaching the journal_head to a transaction.  To protect the
2470 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
2471 * journal_head's b_jcount refcount by one.  The caller must call
2472 * jbd2_journal_put_journal_head() to undo this.
2473 *
2474 * So the typical usage would be:
2475 *
2476 *      (Attach a journal_head if needed.  Increments b_jcount)
2477 *      struct journal_head *jh = jbd2_journal_add_journal_head(bh);
2478 *      ...
2479 *      (Get another reference for transaction)
2480 *      jbd2_journal_grab_journal_head(bh);
2481 *      jh->b_transaction = xxx;
2482 *      (Put original reference)
2483 *      jbd2_journal_put_journal_head(jh);
2484 */
2485
2486/*
2487 * Give a buffer_head a journal_head.
2488 *
2489 * May sleep.
2490 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh;
	struct journal_head *new_jh = NULL;

repeat:
	/*
	 * Allocate speculatively, before taking the journal_head lock:
	 * the allocation may sleep, and the buffer may gain or lose a
	 * journal_head while we do so.
	 */
	if (!buffer_jbd(bh))
		new_jh = journal_alloc_journal_head();

	jbd_lock_bh_journal_head(bh);
	if (buffer_jbd(bh)) {
		/* Already has one (attached by us earlier or by a racer). */
		jh = bh2jh(bh);
	} else {
		J_ASSERT_BH(bh,
			(atomic_read(&bh->b_count) > 0) ||
			(bh->b_page && bh->b_page->mapping));

		/*
		 * The buffer was journaled when we checked above but is
		 * bare now, and we have nothing to attach — go around
		 * again to allocate.
		 */
		if (!new_jh) {
			jbd_unlock_bh_journal_head(bh);
			goto repeat;
		}

		jh = new_jh;
		new_jh = NULL;		/* We consumed it */
		set_buffer_jbd(bh);
		bh->b_private = jh;
		jh->b_bh = bh;
		get_bh(bh);		/* BH_JBD pins the buffer_head */
		BUFFER_TRACE(bh, "added journal_head");
	}
	jh->b_jcount++;
	jbd_unlock_bh_journal_head(bh);
	/* Lost the race: discard the unused speculative allocation. */
	if (new_jh)
		journal_free_journal_head(new_jh);
	return bh->b_private;
}
2527
2528/*
2529 * Grab a ref against this buffer_head's journal_head.  If it ended up not
2530 * having a journal_head, return NULL
2531 */
2532struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
2533{
2534        struct journal_head *jh = NULL;
2535
2536        jbd_lock_bh_journal_head(bh);
2537        if (buffer_jbd(bh)) {
2538                jh = bh2jh(bh);
2539                jh->b_jcount++;
2540        }
2541        jbd_unlock_bh_journal_head(bh);
2542        return jh;
2543}
2544
2545static void __journal_remove_journal_head(struct buffer_head *bh)
2546{
2547        struct journal_head *jh = bh2jh(bh);
2548
2549        J_ASSERT_JH(jh, jh->b_transaction == NULL);
2550        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2551        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
2552        J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
2553        J_ASSERT_BH(bh, buffer_jbd(bh));
2554        J_ASSERT_BH(bh, jh2bh(jh) == bh);
2555        BUFFER_TRACE(bh, "remove journal_head");
2556        if (jh->b_frozen_data) {
2557                printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
2558                jbd2_free(jh->b_frozen_data, bh->b_size);
2559        }
2560        if (jh->b_committed_data) {
2561                printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
2562                jbd2_free(jh->b_committed_data, bh->b_size);
2563        }
2564        bh->b_private = NULL;
2565        jh->b_bh = NULL;        /* debug, really */
2566        clear_buffer_jbd(bh);
2567        journal_free_journal_head(jh);
2568}
2569
2570/*
2571 * Drop a reference on the passed journal_head.  If it fell to zero then
2572 * release the journal_head from the buffer_head.
2573 */
void jbd2_journal_put_journal_head(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_journal_head(bh);
	J_ASSERT_JH(jh, jh->b_jcount > 0);
	--jh->b_jcount;
	if (!jh->b_jcount) {
		/* Last reference: detach the jh, then drop the b_count
		 * reference that was taken when BH_JBD was set. */
		__journal_remove_journal_head(bh);
		jbd_unlock_bh_journal_head(bh);
		__brelse(bh);
	} else
		jbd_unlock_bh_journal_head(bh);
}
2588
2589/*
2590 * Initialize jbd inode head
2591 */
2592void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2593{
2594        jinode->i_transaction = NULL;
2595        jinode->i_next_transaction = NULL;
2596        jinode->i_vfs_inode = inode;
2597        jinode->i_flags = 0;
2598        jinode->i_dirty_start = 0;
2599        jinode->i_dirty_end = 0;
2600        INIT_LIST_HEAD(&jinode->i_list);
2601}
2602
2603/*
2604 * Function to be called before we start removing inode from memory (i.e.,
2605 * clear_inode() is a fine place to be called from). It removes inode from
2606 * transaction's lists.
2607 */
void jbd2_journal_release_jbd_inode(journal_t *journal,
				    struct jbd2_inode *jinode)
{
	if (!journal)
		return;
restart:
	spin_lock(&journal->j_list_lock);
	/* Is commit writing out inode - we have to wait */
	if (jinode->i_flags & JI_COMMIT_RUNNING) {
		wait_queue_head_t *wq;
		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		/* Must drop the lock before sleeping; the commit thread
		 * needs it to clear JI_COMMIT_RUNNING and wake us. */
		spin_unlock(&journal->j_list_lock);
		schedule();
		finish_wait(wq, &wait.wq_entry);
		/* Another commit may have started meanwhile; recheck. */
		goto restart;
	}

	/* Unlink the inode from the transaction's inode list, if queued. */
	if (jinode->i_transaction) {
		list_del(&jinode->i_list);
		jinode->i_transaction = NULL;
	}
	spin_unlock(&journal->j_list_lock);
}
2633
2634
#ifdef CONFIG_PROC_FS

/* Directory under /proc that holds jbd2 statistics entries. */
#define JBD2_STATS_PROC_NAME "fs/jbd2"

/* Create /proc/fs/jbd2; on failure proc_jbd2_stats stays NULL. */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
	/* proc_mkdir() may have failed at init time, so check first. */
	if (proc_jbd2_stats)
		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without procfs the create/remove hooks compile away to no-ops. */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif
2656
/* Slab caches for handle and jbd2_inode objects; created at module init. */
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2658
2659static int __init jbd2_journal_init_inode_cache(void)
2660{
2661        J_ASSERT(!jbd2_inode_cache);
2662        jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2663        if (!jbd2_inode_cache) {
2664                pr_emerg("JBD2: failed to create inode cache\n");
2665                return -ENOMEM;
2666        }
2667        return 0;
2668}
2669
2670static int __init jbd2_journal_init_handle_cache(void)
2671{
2672        J_ASSERT(!jbd2_handle_cache);
2673        jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2674        if (!jbd2_handle_cache) {
2675                printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2676                return -ENOMEM;
2677        }
2678        return 0;
2679}
2680
/* Tear down the jbd2_inode cache; the NULL reset keeps a later
 * re-init's J_ASSERT(!jbd2_inode_cache) honest. */
static void jbd2_journal_destroy_inode_cache(void)
{
	kmem_cache_destroy(jbd2_inode_cache);
	jbd2_inode_cache = NULL;
}
2686
/* Tear down the handle cache; the NULL reset keeps a later
 * re-init's J_ASSERT(!jbd2_handle_cache) honest. */
static void jbd2_journal_destroy_handle_cache(void)
{
	kmem_cache_destroy(jbd2_handle_cache);
	jbd2_handle_cache = NULL;
}
2692
2693/*
2694 * Module startup and shutdown
2695 */
2696
2697static int __init journal_init_caches(void)
2698{
2699        int ret;
2700
2701        ret = jbd2_journal_init_revoke_record_cache();
2702        if (ret == 0)
2703                ret = jbd2_journal_init_revoke_table_cache();
2704        if (ret == 0)
2705                ret = jbd2_journal_init_journal_head_cache();
2706        if (ret == 0)
2707                ret = jbd2_journal_init_handle_cache();
2708        if (ret == 0)
2709                ret = jbd2_journal_init_inode_cache();
2710        if (ret == 0)
2711                ret = jbd2_journal_init_transaction_cache();
2712        return ret;
2713}
2714
/*
 * Free all jbd2 slab caches, including the per-blocksize slabs.
 * Called on module exit and when journal_init_caches() fails partway.
 */
static void jbd2_journal_destroy_caches(void)
{
	jbd2_journal_destroy_revoke_record_cache();
	jbd2_journal_destroy_revoke_table_cache();
	jbd2_journal_destroy_journal_head_cache();
	jbd2_journal_destroy_handle_cache();
	jbd2_journal_destroy_inode_cache();
	jbd2_journal_destroy_transaction_cache();
	jbd2_journal_destroy_slabs();
}
2725
2726static int __init journal_init(void)
2727{
2728        int ret;
2729
2730        BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2731
2732        ret = journal_init_caches();
2733        if (ret == 0) {
2734                jbd2_create_jbd_stats_proc_entry();
2735        } else {
2736                jbd2_journal_destroy_caches();
2737        }
2738        return ret;
2739}
2740
2741static void __exit journal_exit(void)
2742{
2743#ifdef CONFIG_JBD2_DEBUG
2744        int n = atomic_read(&nr_journal_heads);
2745        if (n)
2746                printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
2747#endif
2748        jbd2_remove_jbd_stats_proc_entry();
2749        jbd2_journal_destroy_caches();
2750}
2751
2752MODULE_LICENSE("GPL");
2753module_init(journal_init);
2754module_exit(journal_exit);
2755
2756