linux/fs/reiserfs/journal.c
/*
** Write ahead logging implementation copyright Chris Mason 2000
**
** The background commits make this code very interrelated, and
** overly complex.  I need to rethink things a bit....The major players:
**
** journal_begin -- call with the number of blocks you expect to log.
**                  If the current transaction is too
**                  old, it will block until the current transaction is
**                  finished, and then start a new one.
**                  Usually, your transaction will get joined in with
**                  previous ones for speed.
**
** journal_join  -- same as journal_begin, but won't block on the current
**                  transaction regardless of age.  Don't ever call
**                  this.  Ever.  There are only two places it should be
**                  called from, and they are both inside this file.
**
** journal_mark_dirty -- adds blocks into this transaction.  clears any flags
**                       that might make them get sent to disk
**                       and then marks them BH_JDirty.  Puts the buffer head
**                       into the current transaction hash.
**
** journal_end -- if the current transaction is batchable, it does nothing
**                   otherwise, it could do an async/synchronous commit, or
**                   a full flush of all log and real blocks in the
**                   transaction.
**
** flush_old_commits -- if the current transaction is too old, it is ended and
**                      commit blocks are sent to disk.  Forces commit blocks
**                      to disk for all backgrounded commits that have been
**                      around too long.
**                   -- Note, if you call this as an immediate flush from
**                      within kupdate, it will ignore the immediate flag
*/

#include <linux/time.h>
#include <linux/semaphore.h>
#include <linux/vmalloc.h>
#include <linux/reiserfs_fs.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/uaccess.h>
#include <linux/slab.h>

#include <asm/system.h>

/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
                               j_list))
#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
                               j_working_list))

/* the number of mounted filesystems.  This is used to decide when to
** start and kill the commit workqueue
*/
static int reiserfs_mounted_fs_count;

static struct workqueue_struct *commit_wq;

#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
                                   structs at 4k */
#define BUFNR 64                /* read ahead */

/* cnode stat bits.  Move these into reiserfs_fs.h */

#define BLOCK_FREED 2           /* this block was freed, and can't be written.  */
#define BLOCK_FREED_HOLDER 3    /* this block was freed during this transaction, and can't be written */

#define BLOCK_NEEDS_FLUSH 4     /* used in flush_journal_list */
#define BLOCK_DIRTIED 5

/* journal list state bits */
#define LIST_TOUCHED 1
#define LIST_DIRTY   2
#define LIST_COMMIT_PENDING  4  /* someone will commit this list */

/* flags for do_journal_end */
#define FLUSH_ALL   1           /* flush commit and real blocks */
#define COMMIT_NOW  2           /* end and commit this transaction */
#define WAIT        4           /* wait for the log blocks to hit the disk */

static int do_journal_end(struct reiserfs_transaction_handle *,
                          struct super_block *, unsigned long nblocks,
                          int flags);
static int flush_journal_list(struct super_block *s,
                              struct reiserfs_journal_list *jl, int flushall);
static int flush_commit_list(struct super_block *s,
                             struct reiserfs_journal_list *jl, int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
                        struct super_block *sb, unsigned long nblocks);
static int release_journal_dev(struct super_block *super,
                               struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
                                 struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
static void queue_log_writer(struct super_block *s);

/* values for join in do_journal_begin_r */
enum {
        JBEGIN_REG = 0,         /* regular journal begin */
        JBEGIN_JOIN = 1,        /* join the running transaction if at all possible */
        JBEGIN_ABORT = 2,       /* called from cleanup code, ignores aborted flag */
};

static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
                              struct super_block *sb,
                              unsigned long nblocks, int join);

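/* reset the hash table that tracks cnodes for the running transaction */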
static void init_journal_hash(struct super_block *sb)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        memset(journal->j_hash_table, 0,
               JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
}

/*
** clears BH_Dirty and sticks the buffer on the clean list.  Called because I can't allow refile_buffer to
** make schedule happen after I've freed a block.  Look at remove_from_transaction and journal_mark_freed for
** more details.
*/
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
{
        if (bh) {
                clear_buffer_dirty(bh);
                clear_buffer_journal_test(bh);
        }
        return 0;
}

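/* allocate a bitmap node plus its block-sized data buffer; returns NULL if
** either allocation fails */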
static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
                                                         *sb)
{
        struct reiserfs_bitmap_node *bn;
        static int id;

        bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
        if (!bn) {
                return NULL;
        }
        bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
        if (!bn->data) {
                kfree(bn);
                return NULL;
        }
        bn->id = id++;
        INIT_LIST_HEAD(&bn->list);
        return bn;
}

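/* take a zeroed bitmap node off the journal's free list, or allocate a new
** one.  This never returns NULL; if memory is tight we yield and retry until
** the allocation succeeds */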
static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct reiserfs_bitmap_node *bn = NULL;
        struct list_head *entry = journal->j_bitmap_nodes.next;

        journal->j_used_bitmap_nodes++;
      repeat:

        if (entry != &journal->j_bitmap_nodes) {
                bn = list_entry(entry, struct reiserfs_bitmap_node, list);
                list_del(entry);
                memset(bn->data, 0, sb->s_blocksize);
                journal->j_free_bitmap_nodes--;
                return bn;
        }
        bn = allocate_bitmap_node(sb);
        if (!bn) {
                yield();
                goto repeat;
        }
        return bn;
}

static inline void free_bitmap_node(struct super_block *sb,
                                    struct reiserfs_bitmap_node *bn)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        journal->j_used_bitmap_nodes--;
        if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
                kfree(bn->data);
                kfree(bn);
        } else {
                list_add(&bn->list, &journal->j_bitmap_nodes);
                journal->j_free_bitmap_nodes++;
        }
}

static void allocate_bitmap_nodes(struct super_block *sb)
{
        int i;
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct reiserfs_bitmap_node *bn = NULL;
        for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
                bn = allocate_bitmap_node(sb);
                if (bn) {
                        list_add(&bn->list, &journal->j_bitmap_nodes);
                        journal->j_free_bitmap_nodes++;
                } else {
                        break;  /* this is ok, we'll try again when more are needed */
                }
        }
}

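/* mark a block in this journal list's bitmap.  Each bitmap node covers
** s_blocksize * 8 blocks, so the block number splits into a node index and
** a bit offset within that node */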
static int set_bit_in_list_bitmap(struct super_block *sb,
                                  b_blocknr_t block,
                                  struct reiserfs_list_bitmap *jb)
{
        unsigned int bmap_nr = block / (sb->s_blocksize << 3);
        unsigned int bit_nr = block % (sb->s_blocksize << 3);

        if (!jb->bitmaps[bmap_nr]) {
                jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
        }
        set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
        return 0;
}

static void cleanup_bitmap_list(struct super_block *sb,
                                struct reiserfs_list_bitmap *jb)
{
        int i;
        if (jb->bitmaps == NULL)
                return;

        for (i = 0; i < reiserfs_bmap_count(sb); i++) {
                if (jb->bitmaps[i]) {
                        free_bitmap_node(sb, jb->bitmaps[i]);
                        jb->bitmaps[i] = NULL;
                }
        }
}

/*
** only call this on FS unmount.
*/
static int free_list_bitmaps(struct super_block *sb,
                             struct reiserfs_list_bitmap *jb_array)
{
        int i;
        struct reiserfs_list_bitmap *jb;
        for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
                jb = jb_array + i;
                jb->journal_list = NULL;
                cleanup_bitmap_list(sb, jb);
                vfree(jb->bitmaps);
                jb->bitmaps = NULL;
        }
        return 0;
}

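/* walk the journal's free list of bitmap nodes and release them all; used
** when the journal is torn down */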
static int free_bitmap_nodes(struct super_block *sb)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct list_head *next = journal->j_bitmap_nodes.next;
        struct reiserfs_bitmap_node *bn;

        while (next != &journal->j_bitmap_nodes) {
                bn = list_entry(next, struct reiserfs_bitmap_node, list);
                list_del(next);
                kfree(bn->data);
                kfree(bn);
                next = journal->j_bitmap_nodes.next;
                journal->j_free_bitmap_nodes--;
        }

        return 0;
}

/*
** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
** jb_array is the array to be filled in.
*/
int reiserfs_allocate_list_bitmaps(struct super_block *sb,
                                   struct reiserfs_list_bitmap *jb_array,
                                   unsigned int bmap_nr)
{
        int i;
        int failed = 0;
        struct reiserfs_list_bitmap *jb;
        int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);

        for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
                jb = jb_array + i;
                jb->journal_list = NULL;
                jb->bitmaps = vzalloc(mem);
                if (!jb->bitmaps) {
                        reiserfs_warning(sb, "clm-2000", "unable to "
                                         "allocate bitmaps for journal lists");
                        failed = 1;
                        break;
                }
        }
        if (failed) {
                free_list_bitmaps(sb, jb_array);
                return -1;
        }
        return 0;
}

/*
** find an available list bitmap.  If you can't find one, flush a commit list
** and try again
*/
static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
                                                    struct reiserfs_journal_list
                                                    *jl)
{
        int i, j;
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct reiserfs_list_bitmap *jb = NULL;

        for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
                i = journal->j_list_bitmap_index;
                journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
                jb = journal->j_list_bitmap + i;
                if (journal->j_list_bitmap[i].journal_list) {
                        flush_commit_list(sb,
                                          journal->j_list_bitmap[i].
                                          journal_list, 1);
                        if (!journal->j_list_bitmap[i].journal_list) {
                                break;
                        }
                } else {
                        break;
                }
        }
        if (jb->journal_list) { /* double check to make sure it was flushed correctly */
                return NULL;
        }
        jb->journal_list = jl;
        return jb;
}


/*
** allocates a new chunk of X nodes, and links them all together as a list.
** Uses the cnode->next and cnode->prev pointers
** returns NULL on failure
*/
static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
{
        struct reiserfs_journal_cnode *head;
        int i;
        if (num_cnodes <= 0) {
                return NULL;
        }
        head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
        if (!head) {
                return NULL;
        }
        head[0].prev = NULL;
        head[0].next = head + 1;
        for (i = 1; i < num_cnodes; i++) {
                head[i].prev = head + (i - 1);
                head[i].next = head + (i + 1);  /* if last one, overwrite it after the if */
        }
        head[num_cnodes - 1].next = NULL;
        return head;
}

/*
** pulls a cnode off the free list, or returns NULL on failure
*/
static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
{
        struct reiserfs_journal_cnode *cn;
        struct reiserfs_journal *journal = SB_JOURNAL(sb);

        reiserfs_check_lock_depth(sb, "get_cnode");

        if (journal->j_cnode_free <= 0) {
                return NULL;
        }
        journal->j_cnode_used++;
        journal->j_cnode_free--;
        cn = journal->j_cnode_free_list;
        if (!cn) {
                return cn;
        }
        if (cn->next) {
                cn->next->prev = NULL;
        }
        journal->j_cnode_free_list = cn->next;
        memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
        return cn;
}

/*
** returns a cnode to the free list
*/
static void free_cnode(struct super_block *sb,
                       struct reiserfs_journal_cnode *cn)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);

        reiserfs_check_lock_depth(sb, "free_cnode");

        journal->j_cnode_used--;
        journal->j_cnode_free++;
        /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
        cn->next = journal->j_cnode_free_list;
        if (journal->j_cnode_free_list) {
                journal->j_cnode_free_list->prev = cn;
        }
        cn->prev = NULL;        /* not needed with the memset, but I might kill the memset, and forget to do this */
        journal->j_cnode_free_list = cn;
}

static void clear_prepared_bits(struct buffer_head *bh)
{
        clear_buffer_journal_prepared(bh);
        clear_buffer_journal_restore_dirty(bh);
}

/* return a cnode with same dev, block number and size in table, or null if not found */
static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
                                                                  super_block
                                                                  *sb,
                                                                  struct
                                                                  reiserfs_journal_cnode
                                                                  **table,
                                                                  long bl)
{
        struct reiserfs_journal_cnode *cn;
        cn = journal_hash(table, sb, bl);
        while (cn) {
                if (cn->blocknr == bl && cn->sb == sb)
                        return cn;
                cn = cn->hnext;
        }
        return (struct reiserfs_journal_cnode *)0;
}

/*
** this actually means 'can this block be reallocated yet?'.  If you set search_all, a block can only be allocated
** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
** being overwritten by a replay after crashing.
**
** If you don't set search_all, a block can only be allocated if it is not in the current transaction.  Since deleting
** a block removes it from the current transaction, this case should never happen.  If you don't set search_all, make
** sure you never write the block without logging it.
**
** next_zero_bit is a suggestion about the next block to try for find_forward.
** when bl is rejected because it is set in a journal list bitmap, we search
** for the next zero bit in the bitmap that rejected bl.  Then, we return that
** through next_zero_bit for find_forward to try.
**
** Just because we return something in next_zero_bit does not mean we won't
** reject it on the next call to reiserfs_in_journal
**
*/
int reiserfs_in_journal(struct super_block *sb,
                        unsigned int bmap_nr, int bit_nr, int search_all,
                        b_blocknr_t * next_zero_bit)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct reiserfs_journal_cnode *cn;
        struct reiserfs_list_bitmap *jb;
        int i;
        unsigned long bl;

        *next_zero_bit = 0;     /* always start this at zero. */

        PROC_INFO_INC(sb, journal.in_journal);
        /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
         ** if we crash before the transaction that freed it commits,  this transaction won't
         ** have committed either, and the block will never be written
         */
        if (search_all) {
                for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
                        PROC_INFO_INC(sb, journal.in_journal_bitmap);
                        jb = journal->j_list_bitmap + i;
                        if (jb->journal_list && jb->bitmaps[bmap_nr] &&
                            test_bit(bit_nr,
                                     (unsigned long *)jb->bitmaps[bmap_nr]->
                                     data)) {
                                *next_zero_bit =
                                    find_next_zero_bit((unsigned long *)
                                                       (jb->bitmaps[bmap_nr]->
                                                        data),
                                                       sb->s_blocksize << 3,
                                                       bit_nr + 1);
                                return 1;
                        }
                }
        }

        bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
        /* is it in any old transactions? */
        if (search_all
            && (cn =
                get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
                return 1;
        }

        /* is it in the current transaction?  This should never happen */
        if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
                BUG();
                return 1;
        }

        PROC_INFO_INC(sb, journal.in_journal_reusable);
        /* safe for reuse */
        return 0;
}

/* insert cn into table */
static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
                                       struct reiserfs_journal_cnode *cn)
{
        struct reiserfs_journal_cnode *cn_orig;

        cn_orig = journal_hash(table, cn->sb, cn->blocknr);
        cn->hnext = cn_orig;
        cn->hprev = NULL;
        if (cn_orig) {
                cn_orig->hprev = cn;
        }
        journal_hash(table, cn->sb, cn->blocknr) = cn;
}
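/* note: journal_hash() expands to a slot in the cnode hash table (it is a
** macro, not a function), which is why it can appear on the left-hand side
** of the assignment above */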

/* lock the current transaction */
static inline void lock_journal(struct super_block *sb)
{
        PROC_INFO_INC(sb, journal.lock_journal);

        reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
}

/* unlock the current transaction */
static inline void unlock_journal(struct super_block *sb)
{
        mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
}

static inline void get_journal_list(struct reiserfs_journal_list *jl)
{
        jl->j_refcount++;
}

static inline void put_journal_list(struct super_block *s,
                                    struct reiserfs_journal_list *jl)
{
        if (jl->j_refcount < 1) {
                reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
                               jl->j_trans_id, jl->j_refcount);
        }
        if (--jl->j_refcount == 0)
                kfree(jl);
}

/*
** this used to be much more involved, and I'm keeping it just in case things get ugly again.
** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
** transaction.
*/
static void cleanup_freed_for_journal_list(struct super_block *sb,
                                           struct reiserfs_journal_list *jl)
{
        struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
        if (jb) {
                cleanup_bitmap_list(sb, jb);
        }
        jl->j_list_bitmap->journal_list = NULL;
        jl->j_list_bitmap = NULL;
}

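/* nonzero if the journal list with this trans_id might still be on the
** journal's list of lists: the oldest list still around must be no newer
** than the one we're asking about */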
static int journal_list_still_alive(struct super_block *s,
                                    unsigned int trans_id)
{
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        struct list_head *entry = &journal->j_journal_list;
        struct reiserfs_journal_list *jl;

        if (!list_empty(entry)) {
                jl = JOURNAL_LIST_ENTRY(entry->next);
                if (jl->j_trans_id <= trans_id) {
                        return 1;
                }
        }
        return 0;
}

/*
 * If page->mapping was null, we failed to truncate this page for
 * some reason.  Most likely because it was truncated after being
 * logged via data=journal.
 *
 * This does a check to see if the buffer belongs to one of these
 * lost pages before doing the final put_bh.  If page->mapping was
 * null, it tries to free buffers on the page, which should make the
 * final page_cache_release drop the page from the lru.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        if (!page->mapping && trylock_page(page)) {
                page_cache_get(page);
                put_bh(bh);
                if (!page->mapping)
                        try_to_free_buffers(page);
                unlock_page(page);
                page_cache_release(page);
        } else {
                put_bh(bh);
        }
}

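/* end_io handler for journaled metadata writes; a buffer that is still
** journaled (pinned) reaching disk here means something went wrong, so
** complain about it */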
static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];

        if (buffer_journaled(bh)) {
                reiserfs_warning(NULL, "clm-2084",
                                 "pinned buffer %lu:%s sent to disk",
                                 bh->b_blocknr, bdevname(bh->b_bdev, b));
        }
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);

        unlock_buffer(bh);
        release_buffer_page(bh);
}

static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
{
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
        put_bh(bh);
}

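/* submit a buffer carrying logged metadata: it must be uptodate and marked
** journal_test, or the journaling state machine has gone wrong */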
static void submit_logged_buffer(struct buffer_head *bh)
{
        get_bh(bh);
        bh->b_end_io = reiserfs_end_buffer_io_sync;
        clear_buffer_journal_new(bh);
        clear_buffer_dirty(bh);
        if (!test_clear_buffer_journal_test(bh))
                BUG();
        if (!buffer_uptodate(bh))
                BUG();
        submit_bh(WRITE, bh);
}

static void submit_ordered_buffer(struct buffer_head *bh)
{
        get_bh(bh);
        bh->b_end_io = reiserfs_end_ordered_io;
        clear_buffer_dirty(bh);
        if (!buffer_uptodate(bh))
                BUG();
        submit_bh(WRITE, bh);
}

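/* buffers are batched into fixed-size chunks so a group of them can be
** submitted with the spinlock dropped, instead of bouncing the lock once
** per buffer */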
#define CHUNK_SIZE 32
struct buffer_chunk {
        struct buffer_head *bh[CHUNK_SIZE];
        int nr;
};

static void write_chunk(struct buffer_chunk *chunk)
{
        int i;
        for (i = 0; i < chunk->nr; i++) {
                submit_logged_buffer(chunk->bh[i]);
        }
        chunk->nr = 0;
}

static void write_ordered_chunk(struct buffer_chunk *chunk)
{
        int i;
        for (i = 0; i < chunk->nr; i++) {
                submit_ordered_buffer(chunk->bh[i]);
        }
        chunk->nr = 0;
}

static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
                        spinlock_t * lock, void (fn) (struct buffer_chunk *))
{
        int ret = 0;
        BUG_ON(chunk->nr >= CHUNK_SIZE);
        chunk->bh[chunk->nr++] = bh;
        if (chunk->nr >= CHUNK_SIZE) {
                ret = 1;
                if (lock)
                        spin_unlock(lock);
                fn(chunk);
                if (lock)
                        spin_lock(lock);
        }
        return ret;
}

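/* nr_reiserfs_jh counts live journal head structs for sanity checking.
** alloc_jh never fails; it yields and retries until kmalloc succeeds */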
static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
static struct reiserfs_jh *alloc_jh(void)
{
        struct reiserfs_jh *jh;
        while (1) {
                jh = kmalloc(sizeof(*jh), GFP_NOFS);
                if (jh) {
                        atomic_inc(&nr_reiserfs_jh);
                        return jh;
                }
                yield();
        }
}

/*
 * we want to free the jh when the buffer has been written
 * and waited on
 */
void reiserfs_free_jh(struct buffer_head *bh)
{
        struct reiserfs_jh *jh;

        jh = bh->b_private;
        if (jh) {
                bh->b_private = NULL;
                jh->bh = NULL;
                list_del_init(&jh->list);
                kfree(jh);
                if (atomic_read(&nr_reiserfs_jh) <= 0)
                        BUG();
                atomic_dec(&nr_reiserfs_jh);
                put_bh(bh);
        }
}

static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
                           int tail)
{
        struct reiserfs_jh *jh;

        if (bh->b_private) {
                spin_lock(&j->j_dirty_buffers_lock);
                if (!bh->b_private) {
                        spin_unlock(&j->j_dirty_buffers_lock);
                        goto no_jh;
                }
                jh = bh->b_private;
                list_del_init(&jh->list);
        } else {
              no_jh:
                get_bh(bh);
                jh = alloc_jh();
                spin_lock(&j->j_dirty_buffers_lock);
                /* buffer must be locked for __add_jh, should be able to have
                 * two adds at the same time
                 */
                BUG_ON(bh->b_private);
                jh->bh = bh;
                bh->b_private = jh;
        }
        jh->jl = j->j_current_jl;
        if (tail)
                list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
        else {
                list_add_tail(&jh->list, &jh->jl->j_bh_list);
        }
        spin_unlock(&j->j_dirty_buffers_lock);
        return 0;
}

int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
{
        return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
}
int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
{
        return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
}

#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
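/* write out the ordered (data) buffers attached to a journal list.  Two
** passes: the first submits dirty buffers in chunks, moving them onto a
** temporary list; the second waits for the I/O and collects any errors.
** Buffers that turn out clean just lose their journal head */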
static int write_ordered_buffers(spinlock_t * lock,
                                 struct reiserfs_journal *j,
                                 struct reiserfs_journal_list *jl,
                                 struct list_head *list)
{
        struct buffer_head *bh;
        struct reiserfs_jh *jh;
        int ret = j->j_errno;
        struct buffer_chunk chunk;
        struct list_head tmp;
        INIT_LIST_HEAD(&tmp);

        chunk.nr = 0;
        spin_lock(lock);
        while (!list_empty(list)) {
                jh = JH_ENTRY(list->next);
                bh = jh->bh;
                get_bh(bh);
                if (!trylock_buffer(bh)) {
                        if (!buffer_dirty(bh)) {
                                list_move(&jh->list, &tmp);
                                goto loop_next;
                        }
                        spin_unlock(lock);
                        if (chunk.nr)
                                write_ordered_chunk(&chunk);
                        wait_on_buffer(bh);
                        cond_resched();
                        spin_lock(lock);
                        goto loop_next;
                }
                /* in theory, dirty non-uptodate buffers should never get here,
                 * but the upper layer io error paths still have a few quirks.
                 * Handle them here as gracefully as we can
                 */
                if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
                        clear_buffer_dirty(bh);
                        ret = -EIO;
                }
                if (buffer_dirty(bh)) {
                        list_move(&jh->list, &tmp);
                        add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
                } else {
                        reiserfs_free_jh(bh);
                        unlock_buffer(bh);
                }
              loop_next:
                put_bh(bh);
                cond_resched_lock(lock);
        }
        if (chunk.nr) {
                spin_unlock(lock);
                write_ordered_chunk(&chunk);
                spin_lock(lock);
        }
        while (!list_empty(&tmp)) {
                jh = JH_ENTRY(tmp.prev);
                bh = jh->bh;
                get_bh(bh);
                reiserfs_free_jh(bh);

                if (buffer_locked(bh)) {
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        spin_lock(lock);
                }
                if (!buffer_uptodate(bh)) {
                        ret = -EIO;
                }
                /* ugly interaction with invalidatepage here.
                 * reiserfs_invalidate_page will pin any buffer that has a valid
                 * journal head from an older transaction.  If someone else sets
                 * our buffer dirty after we write it in the first loop, and
                 * then someone truncates the page away, nobody will ever write
                 * the buffer. We're safe if we write the page one last time
                 * after freeing the journal header.
                 */
                if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
                        spin_unlock(lock);
                        ll_rw_block(WRITE, 1, &bh);
                        spin_lock(lock);
                }
                put_bh(bh);
                cond_resched_lock(lock);
        }
        spin_unlock(lock);
        return ret;
}

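/* make sure every commit older than jl has gone to disk.  We walk backwards
** to the oldest list still needing a commit, then flush forwards in order.
** Returns 1 if our own list disappeared during the flushing; if the one we
** just flushed disappears, we restart the backwards walk */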
static int flush_older_commits(struct super_block *s,
                               struct reiserfs_journal_list *jl)
{
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        struct reiserfs_journal_list *other_jl;
        struct reiserfs_journal_list *first_jl;
        struct list_head *entry;
        unsigned int trans_id = jl->j_trans_id;
        unsigned int other_trans_id;
        unsigned int first_trans_id;

      find_first:
        /*
         * first we walk backwards to find the oldest uncommitted transaction
         */
        first_jl = jl;
        entry = jl->j_list.prev;
        while (1) {
                other_jl = JOURNAL_LIST_ENTRY(entry);
                if (entry == &journal->j_journal_list ||
                    atomic_read(&other_jl->j_older_commits_done))
                        break;

                first_jl = other_jl;
                entry = other_jl->j_list.prev;
        }

        /* if we didn't find any older uncommitted transactions, return now */
        if (first_jl == jl) {
                return 0;
        }

        first_trans_id = first_jl->j_trans_id;

        entry = &first_jl->j_list;
        while (1) {
                other_jl = JOURNAL_LIST_ENTRY(entry);
                other_trans_id = other_jl->j_trans_id;

                if (other_trans_id < trans_id) {
                        if (atomic_read(&other_jl->j_commit_left) != 0) {
                                flush_commit_list(s, other_jl, 0);

                                /* list we were called with is gone, return */
                                if (!journal_list_still_alive(s, trans_id))
                                        return 1;

                                /* the one we just flushed is gone, this means all
                                 * older lists are also gone, so first_jl is no longer
                                 * valid either.  Go back to the beginning.
                                 */
                                if (!journal_list_still_alive
                                    (s, other_trans_id)) {
                                        goto find_first;
                                }
                        }
                        entry = entry->next;
                        if (entry == &journal->j_journal_list)
                                return 0;
                } else {
                        return 0;
                }
        }
        return 0;
}

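/* throttle: if async commits are still in flight, drop the write lock and
** wait for the block device to become less congested */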
static int reiserfs_async_progress_wait(struct super_block *s)
{
        struct reiserfs_journal *j = SB_JOURNAL(s);

        if (atomic_read(&j->j_async_throttle)) {
                reiserfs_write_unlock(s);
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
                reiserfs_write_lock(s);
        }

        return 0;
}

/*
** if this journal list still has commit blocks unflushed, send them to disk.
**
** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
** Before the commit block can be written, every other log block must be safely on disk
**
*/
static int flush_commit_list(struct super_block *s,
                             struct reiserfs_journal_list *jl, int flushall)
{
        int i;
        b_blocknr_t bn;
        struct buffer_head *tbh = NULL;
        unsigned int trans_id = jl->j_trans_id;
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        int retval = 0;
        int write_len;

        reiserfs_check_lock_depth(s, "flush_commit_list");

        if (atomic_read(&jl->j_older_commits_done)) {
                return 0;
        }

        /* before we can put our commit blocks on disk, we have to make sure everyone older than
         ** us is on disk too
         */
        BUG_ON(jl->j_len <= 0);
        BUG_ON(trans_id == journal->j_trans_id);

        get_journal_list(jl);
        if (flushall) {
                if (flush_older_commits(s, jl) == 1) {
                        /* list disappeared during flush_older_commits.  return */
                        goto put_jl;
                }
        }

        /* make sure nobody is trying to flush this one at the same time */
        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);

        if (!journal_list_still_alive(s, trans_id)) {
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
        }
        BUG_ON(jl->j_trans_id == 0);

        /* this commit is done, exit */
        if (atomic_read(&(jl->j_commit_left)) <= 0) {
                if (flushall) {
                        atomic_set(&(jl->j_older_commits_done), 1);
                }
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
        }

        if (!list_empty(&jl->j_bh_list)) {
                int ret;

                /*
                 * We might sleep in numerous places inside
                 * write_ordered_buffers. Relax the write lock.
                 */
                reiserfs_write_unlock(s);
                ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                            journal, jl, &jl->j_bh_list);
                if (ret < 0 && retval == 0)
                        retval = ret;
                reiserfs_write_lock(s);
        }
        BUG_ON(!list_empty(&jl->j_bh_list));
        /*
         * for the description block and all the log blocks, submit any buffers
         * that haven't already reached the disk.  Try to write at least 256
         * log blocks. later on, we will only wait on blocks that correspond
         * to this transaction, but while we're unplugging we might as well
         * get a chunk of data on there.
         */
        atomic_inc(&journal->j_async_throttle);
        write_len = jl->j_len + 1;
        if (write_len < 256)
                write_len = 256;
        for (i = 0; i < write_len; i++) {
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (tbh) {
                        if (buffer_dirty(tbh)) {
                                reiserfs_write_unlock(s);
                                ll_rw_block(WRITE, 1, &tbh);
                                reiserfs_write_lock(s);
                        }
                        put_bh(tbh);
                }
        }
        atomic_dec(&journal->j_async_throttle);

        for (i = 0; i < (jl->j_len + 1); i++) {
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);

                reiserfs_write_unlock(s);
                wait_on_buffer(tbh);
                reiserfs_write_lock(s);
                /* since we're using ll_rw_block above, it might have skipped
                 * over a locked buffer.  Double check here
                 */
                /* redundant, sync_dirty_buffer() checks */
                if (buffer_dirty(tbh)) {
                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(tbh);
                        reiserfs_write_lock(s);
                }
                if (unlikely(!buffer_uptodate(tbh))) {
#ifdef CONFIG_REISERFS_CHECK
                        reiserfs_warning(s, "journal-601",
                                         "buffer write failed");
#endif
                        retval = -EIO;
                }
                put_bh(tbh);    /* once for journal_find_get_block */
                put_bh(tbh);    /* once due to original getblk in do_journal_end */
                atomic_dec(&(jl->j_commit_left));
        }

        BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);

        /* If there was a write error in the journal - we can't commit
         * this transaction - it will be invalid and, if successful,
         * will just end up propagating the write error out to
         * the file system. */
        if (likely(!retval && !reiserfs_is_journal_aborted(journal))) {
                if (buffer_dirty(jl->j_commit_bh))
                        BUG();
                mark_buffer_dirty(jl->j_commit_bh);
                reiserfs_write_unlock(s);
                if (reiserfs_barrier_flush(s))
                        __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
                else
                        sync_dirty_buffer(jl->j_commit_bh);
                reiserfs_write_lock(s);
        }

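        /* note on the barrier above: with barrier flushes enabled, the commit
         * block is written with flush+FUA semantics, so it only becomes
         * durable after the preceding log blocks are on stable storage.
         * Without barriers we fall back to a plain synchronous write. */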
        /* If there was a write error in the journal - we can't commit this
         * transaction - it will be invalid and, if successful, will just end
         * up propagating the write error out to the filesystem. */
        if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
#ifdef CONFIG_REISERFS_CHECK
                reiserfs_warning(s, "journal-615", "buffer write failed");
#endif
                retval = -EIO;
        }
        bforget(jl->j_commit_bh);
        if (journal->j_last_commit_id != 0 &&
            (jl->j_trans_id - journal->j_last_commit_id) != 1) {
                reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
                                 journal->j_last_commit_id, jl->j_trans_id);
        }
        journal->j_last_commit_id = jl->j_trans_id;

        /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
        cleanup_freed_for_journal_list(s, jl);

        retval = retval ? retval : journal->j_errno;

        /* mark the metadata dirty */
        if (!retval)
                dirty_one_transaction(s, jl);
        atomic_dec(&(jl->j_commit_left));

        if (flushall) {
                atomic_set(&(jl->j_older_commits_done), 1);
        }
        mutex_unlock(&jl->j_commit_mutex);
      put_jl:
        put_journal_list(s, jl);

        if (retval)
                reiserfs_abort(s, retval, "Journal write error in %s",
                               __func__);
        return retval;
}

/*
** flush_journal_list frequently needs to find a newer transaction for a given block.  This does that, or
** returns NULL if it can't find anything
*/
static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
                                                          reiserfs_journal_cnode
                                                          *cn)
{
        struct super_block *sb = cn->sb;
        b_blocknr_t blocknr = cn->blocknr;

        cn = cn->hprev;
        while (cn) {
                if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
                        return cn->jlist;
                }
                cn = cn->hprev;
        }
        return NULL;
}

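/* returns 1 if every newer transaction that logged this block has finished
** its commit, 0 if one is still pending */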
static int newer_jl_done(struct reiserfs_journal_cnode *cn)
{
        struct super_block *sb = cn->sb;
        b_blocknr_t blocknr = cn->blocknr;

        cn = cn->hprev;
        while (cn) {
                if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
                    atomic_read(&cn->jlist->j_commit_left) != 0)
                        return 0;
                cn = cn->hprev;
        }
        return 1;
}

static void remove_journal_hash(struct super_block *,
                                struct reiserfs_journal_cnode **,
                                struct reiserfs_journal_list *, unsigned long,
                                int);

/*
** once all the real blocks have been flushed, it is safe to remove them from the
** journal list for this transaction.  Aside from freeing the cnode, this also allows the
** block to be reallocated for data blocks if it had been deleted.
*/
static void remove_all_from_journal_list(struct super_block *sb,
                                         struct reiserfs_journal_list *jl,
                                         int debug)
{
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        struct reiserfs_journal_cnode *cn, *last;
        cn = jl->j_realblock;

        /* which is better, to lock once around the whole loop, or
         ** to lock for each call to remove_journal_hash?
         */
        while (cn) {
                if (cn->blocknr != 0) {
                        if (debug) {
                                reiserfs_warning(sb, "reiserfs-2201",
                                                 "block %u, bh is %d, state %ld",
                                                 cn->blocknr, cn->bh ? 1 : 0,
                                                 cn->state);
                        }
                        cn->state = 0;
                        remove_journal_hash(sb, journal->j_list_hash_table,
                                            jl, cn->blocknr, 1);
                }
                last = cn;
                cn = cn->next;
                free_cnode(sb, last);
        }
        jl->j_realblock = NULL;
}

/*
** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
** releasing blocks in this transaction for reuse as data blocks.
** called by flush_journal_list, before it calls remove_all_from_journal_list
**
*/
static int _update_journal_header_block(struct super_block *sb,
                                        unsigned long offset,
                                        unsigned int trans_id)
{
        struct reiserfs_journal_header *jh;
        struct reiserfs_journal *journal = SB_JOURNAL(sb);

        if (reiserfs_is_journal_aborted(journal))
                return -EIO;

        if (trans_id >= journal->j_last_flush_trans_id) {
                if (buffer_locked((journal->j_header_bh))) {
                        reiserfs_write_unlock(sb);
                        wait_on_buffer((journal->j_header_bh));
                        reiserfs_write_lock(sb);
                        if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
#ifdef CONFIG_REISERFS_CHECK
                                reiserfs_warning(sb, "journal-699",
                                                 "buffer write failed");
#endif
                                return -EIO;
                        }
                }
                journal->j_last_flush_trans_id = trans_id;
                journal->j_first_unflushed_offset = offset;
                jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
                                                        b_data);
                jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
                jh->j_first_unflushed_offset = cpu_to_le32(offset);
                jh->j_mount_id = cpu_to_le32(journal->j_mount_id);

                set_buffer_dirty(journal->j_header_bh);
                reiserfs_write_unlock(sb);

                if (reiserfs_barrier_flush(sb))
                        __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
                else
                        sync_dirty_buffer(journal->j_header_bh);

                reiserfs_write_lock(sb);
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
                                         "IO error during journal replay");
                        return -EIO;
                }
        }
        return 0;
}

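/* trivial wrapper; kept as the entry point the rest of this file calls */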
static int update_journal_header_block(struct super_block *sb,
                                       unsigned long offset,
                                       unsigned int trans_id)
{
        return _update_journal_header_block(sb, offset, trans_id);
}

/*
** flush any and all journal lists older than you are
** can only be called from flush_journal_list
*/
static int flush_older_journal_lists(struct super_block *sb,
                                     struct reiserfs_journal_list *jl)
{
        struct list_head *entry;
        struct reiserfs_journal_list *other_jl;
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        unsigned int trans_id = jl->j_trans_id;

        /* we know we are the only ones flushing things, no extra race
         * protection is required.
         */
      restart:
        entry = journal->j_journal_list.next;
        /* Did we wrap? */
        if (entry == &journal->j_journal_list)
                return 0;
        other_jl = JOURNAL_LIST_ENTRY(entry);
        if (other_jl->j_trans_id < trans_id) {
                BUG_ON(other_jl->j_refcount <= 0);
                /* do not flush all */
                flush_journal_list(sb, other_jl, 0);

                /* other_jl is now deleted from the list */
                goto restart;
        }
        return 0;
}

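/* drop a journal list from the journal's working list and keep the count of
** work lists in sync */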
1314static void del_from_work_list(struct super_block *s,
1315                               struct reiserfs_journal_list *jl)
1316{
1317        struct reiserfs_journal *journal = SB_JOURNAL(s);
1318        if (!list_empty(&jl->j_working_list)) {
1319                list_del_init(&jl->j_working_list);
1320                journal->j_num_work_lists--;
1321        }
1322}
1323
1324/* flush a journal list, both commit and real blocks
1325**
1326** always set flushall to 1, unless you are calling from inside
1327** flush_journal_list
1328**
1329** IMPORTANT.  This can only be called while there are no journal writers,
1330** and the journal is locked.  That means it can only be called from
1331** do_journal_end, or by journal_release
1332*/
1333static int flush_journal_list(struct super_block *s,
1334                              struct reiserfs_journal_list *jl, int flushall)
1335{
1336        struct reiserfs_journal_list *pjl;
1337        struct reiserfs_journal_cnode *cn, *last;
1338        int count;
1339        int was_jwait = 0;
1340        int was_dirty = 0;
1341        struct buffer_head *saved_bh;
1342        unsigned long j_len_saved = jl->j_len;
1343        struct reiserfs_journal *journal = SB_JOURNAL(s);
1344        int err = 0;
1345
1346        BUG_ON(j_len_saved <= 0);
1347
1348        if (atomic_read(&journal->j_wcount) != 0) {
1349                reiserfs_warning(s, "clm-2048", "called with wcount %d",
1350                                 atomic_read(&journal->j_wcount));
1351        }
1352        BUG_ON(jl->j_trans_id == 0);
1353
1354        /* if flushall == 0, the lock is already held */
1355        if (flushall) {
1356                reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1357        } else if (mutex_trylock(&journal->j_flush_mutex)) {
1358                BUG();
1359        }
1360
1361        count = 0;
1362        if (j_len_saved > journal->j_trans_max) {
1363                reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
1364                               j_len_saved, jl->j_trans_id);
1365                return 0;
1366        }
1367
1368        /* if all the work is already done, get out of here */
1369        if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1370            atomic_read(&(jl->j_commit_left)) <= 0) {
1371                goto flush_older_and_return;
1372        }
1373
1374        /* start by putting the commit list on disk.  This will also flush
1375         ** the commit lists of any olders transactions
1376         */
1377        flush_commit_list(s, jl, 1);
1378
1379        if (!(jl->j_state & LIST_DIRTY)
1380            && !reiserfs_is_journal_aborted(journal))
1381                BUG();
1382
1383        /* are we done now? */
1384        if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1385            atomic_read(&(jl->j_commit_left)) <= 0) {
1386                goto flush_older_and_return;
1387        }
1388
1389        /* loop through each cnode, see if we need to write it,
1390         ** or wait on a more recent transaction, or just ignore it
1391         */
1392        if (atomic_read(&(journal->j_wcount)) != 0) {
1393                reiserfs_panic(s, "journal-844", "journal list is flushing, "
1394                               "wcount is not 0");
1395        }
1396        cn = jl->j_realblock;
1397        while (cn) {
1398                was_jwait = 0;
1399                was_dirty = 0;
1400                saved_bh = NULL;
1401                /* blocknr of 0 is no longer in the hash, ignore it */
1402                if (cn->blocknr == 0) {
1403                        goto free_cnode;
1404                }
1405
1406                /* This transaction failed commit. Don't write out to the disk */
1407                if (!(jl->j_state & LIST_DIRTY))
1408                        goto free_cnode;
1409
1410                pjl = find_newer_jl_for_cn(cn);
1411                /* the order is important here.  We check pjl to make sure we
1412                 ** don't clear BH_JDirty_wait if we aren't the one writing this
1413                 ** block to disk
1414                 */
1415                if (!pjl && cn->bh) {
1416                        saved_bh = cn->bh;
1417
1418                        /* we do this to make sure nobody releases the buffer while
1419                         ** we are working with it
1420                         */
1421                        get_bh(saved_bh);
1422
1423                        if (buffer_journal_dirty(saved_bh)) {
1424                                BUG_ON(!can_dirty(cn));
1425                                was_jwait = 1;
1426                                was_dirty = 1;
1427                        } else if (can_dirty(cn)) {
1428                                /* everything with !pjl && jwait should be writable */
1429                                BUG();
1430                        }
1431                }
1432
1433                /* if someone has this block in a newer transaction, just make
1434                 ** sure they are committed, and don't try writing it to disk
1435                 */
1436                if (pjl) {
1437                        if (atomic_read(&pjl->j_commit_left))
1438                                flush_commit_list(s, pjl, 1);
1439                        goto free_cnode;
1440                }
1441
1442                /* bh == NULL when the block got to disk on its own, or
1443                 ** the block got freed in a future transaction
1444                 */
1445                if (saved_bh == NULL) {
1446                        goto free_cnode;
1447                }
1448
1449                /* this should never happen.  kupdate_one_transaction has this list
1450                 ** locked while it works, so we should never see a buffer here that
1451                 ** is not marked JDirty_wait
1452                 */
1453                if ((!was_jwait) && !buffer_locked(saved_bh)) {
1454                        reiserfs_warning(s, "journal-813",
1455                                         "BAD! buffer %llu %cdirty %cjwait, "
1456                                         "not in a newer tranasction",
1457                                         (unsigned long long)saved_bh->
1458                                         b_blocknr, was_dirty ? ' ' : '!',
1459                                         was_jwait ? ' ' : '!');
1460                }
1461                if (was_dirty) {
1462                        /* we inc again because saved_bh gets decremented at free_cnode */
1463                        get_bh(saved_bh);
1464                        set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
1465                        lock_buffer(saved_bh);
1466                        BUG_ON(cn->blocknr != saved_bh->b_blocknr);
1467                        if (buffer_dirty(saved_bh))
1468                                submit_logged_buffer(saved_bh);
1469                        else
1470                                unlock_buffer(saved_bh);
1471                        count++;
1472                } else {
1473                        reiserfs_warning(s, "clm-2082",
1474                                         "Unable to flush buffer %llu in %s",
1475                                         (unsigned long long)saved_bh->
1476                                         b_blocknr, __func__);
1477                }
1478              free_cnode:
1479                last = cn;
1480                cn = cn->next;
1481                if (saved_bh) {
1482                        /* we incremented this to keep others from taking the buffer head away */
1483                        put_bh(saved_bh);
1484                        if (atomic_read(&(saved_bh->b_count)) < 0) {
1485                                reiserfs_warning(s, "journal-945",
1486                                                 "saved_bh->b_count < 0");
1487                        }
1488                }
1489        }
1490        if (count > 0) {
1491                cn = jl->j_realblock;
1492                while (cn) {
1493                        if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
1494                                if (!cn->bh) {
1495                                        reiserfs_panic(s, "journal-1011",
1496                                                       "cn->bh is NULL");
1497                                }
1498
1499                                reiserfs_write_unlock(s);
1500                                wait_on_buffer(cn->bh);
1501                                reiserfs_write_lock(s);
1502
1503                                if (!cn->bh) {
1504                                        reiserfs_panic(s, "journal-1012",
1505                                                       "cn->bh is NULL");
1506                                }
1507                                if (unlikely(!buffer_uptodate(cn->bh))) {
1508#ifdef CONFIG_REISERFS_CHECK
1509                                        reiserfs_warning(s, "journal-949",
1510                                                         "buffer write failed");
1511#endif
1512                                        err = -EIO;
1513                                }
1514                                /* note, we must clear the JDirty_wait bit after the up-to-date
1515                                 ** check, otherwise we race against our flushpage routine
1516                                 */
1517                                BUG_ON(!test_clear_buffer_journal_dirty
1518                                       (cn->bh));
1519
1520                                /* drop one ref for us */
1521                                put_bh(cn->bh);
1522                                /* drop one ref for journal_mark_dirty */
1523                                release_buffer_page(cn->bh);
1524                        }
1525                        cn = cn->next;
1526                }
1527        }
1528
1529        if (err)
1530                reiserfs_abort(s, -EIO,
1531                               "Write error while pushing transaction to disk in %s",
1532                               __func__);
1533      flush_older_and_return:
1534
1535        /* before we can update the journal header block, we _must_ flush all
1536         ** real blocks from all older transactions to disk.  This is because
1537         ** once the header block is updated, this transaction will not be
1538         ** replayed after a crash
1539         */
1540        if (flushall) {
1541                flush_older_journal_lists(s, jl);
1542        }
1543
1544        err = journal->j_errno;
1545        /* before we can remove everything from the hash tables for this
1546         ** transaction, we must make sure it can never be replayed
1547         **
1548         ** since we are only called from do_journal_end, we know for sure there
1549         ** are no allocations going on while we are flushing journal lists.  So,
1550         ** we only need to update the journal header block for the last list
1551         ** being flushed
1552         */
1553        if (!err && flushall) {
1554                err =
1555                    update_journal_header_block(s,
1556                                                (jl->j_start + jl->j_len +
1557                                                 2) % SB_ONDISK_JOURNAL_SIZE(s),
1558                                                jl->j_trans_id);
1559                if (err)
1560                        reiserfs_abort(s, -EIO,
1561                                       "Write error while updating journal header in %s",
1562                                       __func__);
1563        }
1564        remove_all_from_journal_list(s, jl, 0);
1565        list_del_init(&jl->j_list);
1566        journal->j_num_lists--;
1567        del_from_work_list(s, jl);
1568
1569        if (journal->j_last_flush_id != 0 &&
1570            (jl->j_trans_id - journal->j_last_flush_id) != 1) {
1571                reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
1572                                 journal->j_last_flush_id, jl->j_trans_id);
1573        }
1574        journal->j_last_flush_id = jl->j_trans_id;
1575
1576        /* not strictly required since we are freeing the list, but it should
1577         * help find code using dead lists later on
1578         */
1579        jl->j_len = 0;
1580        atomic_set(&(jl->j_nonzerolen), 0);
1581        jl->j_start = 0;
1582        jl->j_realblock = NULL;
1583        jl->j_commit_bh = NULL;
1584        jl->j_trans_id = 0;
1585        jl->j_state = 0;
1586        put_journal_list(s, jl);
1587        if (flushall)
1588                mutex_unlock(&journal->j_flush_mutex);
1589        return err;
1590}
1591
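/* A worked example of the header update above (illustrative numbers):
** a transaction occupies its description block, j_len logged blocks,
** and a commit block, so it spans j_len + 2 journal blocks in all.
** With an on-disk journal of 8192 blocks, a list with j_start = 8189
** and j_len = 4 would advance the header's first unflushed offset to
**
**        (8189 + 4 + 2) % 8192 = 3
**
** i.e. the slot just past the commit block, wrapped around the end of
** the circular log.
*/
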
1592static int test_transaction(struct super_block *s,
1593                            struct reiserfs_journal_list *jl)
1594{
1595        struct reiserfs_journal_cnode *cn;
1596
1597        if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
1598                return 1;
1599
1600        cn = jl->j_realblock;
1601        while (cn) {
1602                /* if the blocknr == 0, this has been cleared from the hash,
1603                 ** skip it
1604                 */
1605                if (cn->blocknr == 0) {
1606                        goto next;
1607                }
1608                if (cn->bh && !newer_jl_done(cn))
1609                        return 0;
1610              next:
1611                cn = cn->next;
1612                cond_resched();
1613        }
1614        return 0;
1615}
1616
1617static int write_one_transaction(struct super_block *s,
1618                                 struct reiserfs_journal_list *jl,
1619                                 struct buffer_chunk *chunk)
1620{
1621        struct reiserfs_journal_cnode *cn;
1622        int ret = 0;
1623
1624        jl->j_state |= LIST_TOUCHED;
1625        del_from_work_list(s, jl);
1626        if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
1627                return 0;
1628        }
1629
1630        cn = jl->j_realblock;
1631        while (cn) {
1632                /* if the blocknr == 0, this has been cleared from the hash,
1633                 ** skip it
1634                 */
1635                if (cn->blocknr == 0) {
1636                        goto next;
1637                }
1638                if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
1639                        struct buffer_head *tmp_bh;
1640                        /* we can race against journal_mark_freed when we try
1641                         * to lock_buffer(cn->bh), so we have to inc the buffer
1642                         * count, and recheck things after locking
1643                         */
1644                        tmp_bh = cn->bh;
1645                        get_bh(tmp_bh);
1646                        lock_buffer(tmp_bh);
1647                        if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
1648                                if (!buffer_journal_dirty(tmp_bh) ||
1649                                    buffer_journal_prepared(tmp_bh))
1650                                        BUG();
1651                                add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
1652                                ret++;
1653                        } else {
1654                                /* note, cn->bh might be null now */
1655                                unlock_buffer(tmp_bh);
1656                        }
1657                        put_bh(tmp_bh);
1658                }
1659              next:
1660                cn = cn->next;
1661                cond_resched();
1662        }
1663        return ret;
1664}
1665
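/* The pin-lock-recheck sequence in write_one_transaction() above is the
** standard defense against journal_mark_freed.  An illustrative sketch
** of the idiom (this is just the shape of the code above, not new
** logic):
**
**        bh = cn->bh;
**        get_bh(bh);          pin it so the buffer head can't be freed
**        lock_buffer(bh);     may sleep; cn->bh can change meanwhile
**        if (cn->bh && can_dirty(cn) && buffer_dirty(bh))
**                add_to_chunk(chunk, bh, NULL, write_chunk);
**        else
**                unlock_buffer(bh);    lost the race, back off
**        put_bh(bh);
**
** The buffer stays locked once it is added to the chunk, and is
** submitted later when the chunk is written out.
*/
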
1666/* used by flush_commit_list */
1667static int dirty_one_transaction(struct super_block *s,
1668                                 struct reiserfs_journal_list *jl)
1669{
1670        struct reiserfs_journal_cnode *cn;
1671        struct reiserfs_journal_list *pjl;
1672        int ret = 0;
1673
1674        jl->j_state |= LIST_DIRTY;
1675        cn = jl->j_realblock;
1676        while (cn) {
1677                /* look for a more recent transaction that logged this
1678                 ** buffer.  Only the most recent transaction with a buffer in
1679                 ** it is allowed to send that buffer to disk
1680                 */
1681                pjl = find_newer_jl_for_cn(cn);
1682                if (!pjl && cn->blocknr && cn->bh
1683                    && buffer_journal_dirty(cn->bh)) {
1684                        BUG_ON(!can_dirty(cn));
1685                        /* if the buffer is prepared, it will either be logged
1686                         * or restored.  If restored, we need to make sure
1687                         * it actually gets marked dirty
1688                         */
1689                        clear_buffer_journal_new(cn->bh);
1690                        if (buffer_journal_prepared(cn->bh)) {
1691                                set_buffer_journal_restore_dirty(cn->bh);
1692                        } else {
1693                                set_buffer_journal_test(cn->bh);
1694                                mark_buffer_dirty(cn->bh);
1695                        }
1696                }
1697                cn = cn->next;
1698        }
1699        return ret;
1700}
1701
1702static int kupdate_transactions(struct super_block *s,
1703                                struct reiserfs_journal_list *jl,
1704                                struct reiserfs_journal_list **next_jl,
1705                                unsigned int *next_trans_id,
1706                                int num_blocks, int num_trans)
1707{
1708        int ret = 0;
1709        int written = 0;
1710        int transactions_flushed = 0;
1711        unsigned int orig_trans_id = jl->j_trans_id;
1712        struct buffer_chunk chunk;
1713        struct list_head *entry;
1714        struct reiserfs_journal *journal = SB_JOURNAL(s);
1715        chunk.nr = 0;
1716
1717        reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1718        if (!journal_list_still_alive(s, orig_trans_id)) {
1719                goto done;
1720        }
1721
1722        /* we've got j_flush_mutex held, nobody is going to delete any
1723         * of these lists out from underneath us
1724         */
1725        while ((num_trans && transactions_flushed < num_trans) ||
1726               (!num_trans && written < num_blocks)) {
1727
1728                if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
1729                    atomic_read(&jl->j_commit_left)
1730                    || !(jl->j_state & LIST_DIRTY)) {
1731                        del_from_work_list(s, jl);
1732                        break;
1733                }
1734                ret = write_one_transaction(s, jl, &chunk);
1735
1736                if (ret < 0)
1737                        goto done;
1738                transactions_flushed++;
1739                written += ret;
1740                entry = jl->j_list.next;
1741
1742                /* did we wrap? */
1743                if (entry == &journal->j_journal_list) {
1744                        break;
1745                }
1746                jl = JOURNAL_LIST_ENTRY(entry);
1747
1748                /* don't bother with older transactions */
1749                if (jl->j_trans_id <= orig_trans_id)
1750                        break;
1751        }
1752        if (chunk.nr) {
1753                write_chunk(&chunk);
1754        }
1755
1756      done:
1757        mutex_unlock(&journal->j_flush_mutex);
1758        return ret;
1759}
1760
1761/* O_SYNC and fsync heavy applications tend to use up
1762** all the journal list slots with tiny transactions.  These
1763** trigger lots and lots of calls to update the header block, which
1764** adds seeks and slows things down.
1765**
1766** This function tries to clear out a large chunk of the journal lists
1767** at once, which makes everything faster since only the newest journal
1768** list updates the header block
1769*/
1770static int flush_used_journal_lists(struct super_block *s,
1771                                    struct reiserfs_journal_list *jl)
1772{
1773        unsigned long len = 0;
1774        unsigned long cur_len;
1775        int ret;
1776        int i;
1777        int limit = 256;
1778        struct reiserfs_journal_list *tjl;
1779        struct reiserfs_journal_list *flush_jl;
1780        unsigned int trans_id;
1781        struct reiserfs_journal *journal = SB_JOURNAL(s);
1782
1783        flush_jl = tjl = jl;
1784
1785        /* in data logging mode, try harder to flush a lot of blocks */
1786        if (reiserfs_data_log(s))
1787                limit = 1024;
1788        /* flush for 256 transactions or limit blocks, whichever comes first */
1789        for (i = 0; i < 256 && len < limit; i++) {
1790                if (atomic_read(&tjl->j_commit_left) ||
1791                    tjl->j_trans_id < jl->j_trans_id) {
1792                        break;
1793                }
1794                cur_len = atomic_read(&tjl->j_nonzerolen);
1795                if (cur_len > 0) {
1796                        tjl->j_state &= ~LIST_TOUCHED;
1797                }
1798                len += cur_len;
1799                flush_jl = tjl;
1800                if (tjl->j_list.next == &journal->j_journal_list)
1801                        break;
1802                tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
1803        }
1804        /* try to find a group of blocks we can flush across all the
1805         ** transactions, but only bother if we've actually spanned
1806         ** across multiple lists
1807         */
1808        if (flush_jl != jl) {
1809                ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
1810        }
1811        flush_journal_list(s, flush_jl, 1);
1812        return 0;
1813}
1814
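/* Example of the batching above (illustrative numbers): with data
** logging enabled the limit is 1024 blocks.  If the journal holds a run
** of lists with roughly 50 dirty blocks each, the loop walks forward
** accumulating j_nonzerolen until len crosses 1024 (about 21 lists) or
** 256 transactions have been scanned.  kupdate_transactions() then
** writes that whole span, and the single flush_journal_list() call on
** the last list updates the header block once instead of once per list.
*/
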
1815/*
1816** removes any nodes in the table that match the given block and super block.
1817** only touches the hnext and hprev pointers.
1818*/
1819void remove_journal_hash(struct super_block *sb,
1820                         struct reiserfs_journal_cnode **table,
1821                         struct reiserfs_journal_list *jl,
1822                         unsigned long block, int remove_freed)
1823{
1824        struct reiserfs_journal_cnode *cur;
1825        struct reiserfs_journal_cnode **head;
1826
1827        head = &(journal_hash(table, sb, block));
1828        if (!head) {
1829                return;
1830        }
1831        cur = *head;
1832        while (cur) {
1833                if (cur->blocknr == block && cur->sb == sb
1834                    && (jl == NULL || jl == cur->jlist)
1835                    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
1836                        if (cur->hnext) {
1837                                cur->hnext->hprev = cur->hprev;
1838                        }
1839                        if (cur->hprev) {
1840                                cur->hprev->hnext = cur->hnext;
1841                        } else {
1842                                *head = cur->hnext;
1843                        }
1844                        cur->blocknr = 0;
1845                        cur->sb = NULL;
1846                        cur->state = 0;
1847                        if (cur->bh && cur->jlist)      /* anybody who clears the cur->bh will also dec the nonzerolen */
1848                                atomic_dec(&(cur->jlist->j_nonzerolen));
1849                        cur->bh = NULL;
1850                        cur->jlist = NULL;
1851                }
1852                cur = cur->hnext;
1853        }
1854}
1855
1856static void free_journal_ram(struct super_block *sb)
1857{
1858        struct reiserfs_journal *journal = SB_JOURNAL(sb);
1859        kfree(journal->j_current_jl);
1860        journal->j_num_lists--;
1861
1862        vfree(journal->j_cnode_free_orig);
1863        free_list_bitmaps(sb, journal->j_list_bitmap);
1864        free_bitmap_nodes(sb);  /* must be after free_list_bitmaps */
1865        if (journal->j_header_bh) {
1866                brelse(journal->j_header_bh);
1867        }
1868        /* j_header_bh is on the journal dev, make sure not to release the journal
1869         * dev until we brelse j_header_bh
1870         */
1871        release_journal_dev(sb, journal);
1872        vfree(journal);
1873}
1874
1875/*
1876** call on unmount.  Only set error to 1 if you haven't made your way out
1877** of read_super() yet.  Any other caller must keep error at 0.
1878*/
1879static int do_journal_release(struct reiserfs_transaction_handle *th,
1880                              struct super_block *sb, int error)
1881{
1882        struct reiserfs_transaction_handle myth;
1883        int flushed = 0;
1884        struct reiserfs_journal *journal = SB_JOURNAL(sb);
1885
1886        /* we only want to flush out transactions if we were called with error == 0
1887         */
1888        if (!error && !(sb->s_flags & MS_RDONLY)) {
1889                /* end the current trans */
1890                BUG_ON(!th->t_trans_id);
1891                do_journal_end(th, sb, 10, FLUSH_ALL);
1892
1893                /* make sure something gets logged to force our way into the flush code */
1894                if (!journal_join(&myth, sb, 1)) {
1895                        reiserfs_prepare_for_journal(sb,
1896                                                     SB_BUFFER_WITH_SB(sb),
1897                                                     1);
1898                        journal_mark_dirty(&myth, sb,
1899                                           SB_BUFFER_WITH_SB(sb));
1900                        do_journal_end(&myth, sb, 1, FLUSH_ALL);
1901                        flushed = 1;
1902                }
1903        }
1904
1905        /* this also catches errors during the do_journal_end above */
1906        if (!error && reiserfs_is_journal_aborted(journal)) {
1907                memset(&myth, 0, sizeof(myth));
1908                if (!journal_join_abort(&myth, sb, 1)) {
1909                        reiserfs_prepare_for_journal(sb,
1910                                                     SB_BUFFER_WITH_SB(sb),
1911                                                     1);
1912                        journal_mark_dirty(&myth, sb,
1913                                           SB_BUFFER_WITH_SB(sb));
1914                        do_journal_end(&myth, sb, 1, FLUSH_ALL);
1915                }
1916        }
1917
1918        reiserfs_mounted_fs_count--;
1919        /* wait for all commits to finish */
1920        cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
1921
1922        /*
1923         * We must release the write lock here because
1924         * the workqueue job (flush_async_commit) needs this lock
1925         */
1926        reiserfs_write_unlock(sb);
1927        flush_workqueue(commit_wq);
1928
1929        if (!reiserfs_mounted_fs_count) {
1930                destroy_workqueue(commit_wq);
1931                commit_wq = NULL;
1932        }
1933
1934        free_journal_ram(sb);
1935
1936        reiserfs_write_lock(sb);
1937
1938        return 0;
1939}
1940
1941/*
1942** call on unmount.  flush all journal transactions, release all allocated ram
1943*/
1944int journal_release(struct reiserfs_transaction_handle *th,
1945                    struct super_block *sb)
1946{
1947        return do_journal_release(th, sb, 0);
1948}
1949
1950/*
1951** only call from an error condition inside reiserfs_read_super!
1952*/
1953int journal_release_error(struct reiserfs_transaction_handle *th,
1954                          struct super_block *sb)
1955{
1956        return do_journal_release(th, sb, 1);
1957}
1958
1959/* compares description block with commit block.  returns 1 if they differ, 0 if they are the same */
1960static int journal_compare_desc_commit(struct super_block *sb,
1961                                       struct reiserfs_journal_desc *desc,
1962                                       struct reiserfs_journal_commit *commit)
1963{
1964        if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
1965            get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
1966            get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
1967            get_commit_trans_len(commit) <= 0) {
1968                return 1;
1969        }
1970        return 0;
1971}
1972
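/* On-disk layout sketch for the checks above (illustrative offsets):
** a transaction written at journal offset 100 with trans_len 3 looks
** like
**
**        offset 100: desc block   (magic, trans_id, trans_len, mount_id)
**        offset 101: logged block 0
**        offset 102: logged block 1
**        offset 103: logged block 2
**        offset 104: commit block (echoes trans_id and trans_len)
**
** so the commit block always sits at (offset + trans_len + 1) modulo
** the journal size, which is exactly where the callers below read it.
*/
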
1973/* returns 0 if it did not find a description block
1974** returns -1 if the transaction is corrupt, stale, or oversized
1975** returns 1 if both desc and commit were valid
1976*/
1977static int journal_transaction_is_valid(struct super_block *sb,
1978                                        struct buffer_head *d_bh,
1979                                        unsigned int *oldest_invalid_trans_id,
1980                                        unsigned long *newest_mount_id)
1981{
1982        struct reiserfs_journal_desc *desc;
1983        struct reiserfs_journal_commit *commit;
1984        struct buffer_head *c_bh;
1985        unsigned long offset;
1986
1987        if (!d_bh)
1988                return 0;
1989
1990        desc = (struct reiserfs_journal_desc *)d_bh->b_data;
1991        if (get_desc_trans_len(desc) > 0
1992            && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
1993                if (oldest_invalid_trans_id && *oldest_invalid_trans_id
1994                    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
1995                        reiserfs_debug(sb, REISERFS_DEBUG_CODE,
1996                                       "journal-986: transaction "
1997                                       "is valid returning because trans_id %d is greater than "
1998                                       "oldest_invalid %lu",
1999                                       get_desc_trans_id(desc),
2000                                       *oldest_invalid_trans_id);
2001                        return 0;
2002                }
2003                if (newest_mount_id
2004                    && *newest_mount_id > get_desc_mount_id(desc)) {
2005                        reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2006                                       "journal-1087: transaction "
2007                                       "is valid returning because mount_id %d is less than "
2008                                       "newest_mount_id %lu",
2009                                       get_desc_mount_id(desc),
2010                                       *newest_mount_id);
2011                        return -1;
2012                }
2013                if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
2014                        reiserfs_warning(sb, "journal-2018",
2015                                         "Bad transaction length %d "
2016                                         "encountered, ignoring transaction",
2017                                         get_desc_trans_len(desc));
2018                        return -1;
2019                }
2020                offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2021
2022                /* ok, we have a journal description block, let's see if the transaction was valid */
2023                c_bh =
2024                    journal_bread(sb,
2025                                  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2026                                  ((offset + get_desc_trans_len(desc) +
2027                                    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
2028                if (!c_bh)
2029                        return 0;
2030                commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2031                if (journal_compare_desc_commit(sb, desc, commit)) {
2032                        reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2033                                       "journal_transaction_is_valid, commit offset %ld had bad "
2034                                       "time %d or length %d",
2035                                       c_bh->b_blocknr -
2036                                       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2037                                       get_commit_trans_id(commit),
2038                                       get_commit_trans_len(commit));
2039                        brelse(c_bh);
2040                        if (oldest_invalid_trans_id) {
2041                                *oldest_invalid_trans_id =
2042                                    get_desc_trans_id(desc);
2043                                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2044                                               "journal-1004: "
2045                                               "transaction_is_valid setting oldest invalid trans_id "
2046                                               "to %d",
2047                                               get_desc_trans_id(desc));
2048                        }
2049                        return -1;
2050                }
2051                brelse(c_bh);
2052                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2053                               "journal-1006: found valid "
2054                               "transaction start offset %llu, len %d id %d",
2055                               d_bh->b_blocknr -
2056                               SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2057                               get_desc_trans_len(desc),
2058                               get_desc_trans_id(desc));
2059                return 1;
2060        } else {
2061                return 0;
2062        }
2063}
2064
2065static void brelse_array(struct buffer_head **heads, int num)
2066{
2067        int i;
2068        for (i = 0; i < num; i++) {
2069                brelse(heads[i]);
2070        }
2071}
2072
2073/*
2074** given the start, and values for the oldest acceptable transactions,
2075** this either reads in and replays a transaction, or returns because the
2076** transaction is invalid, or too old.
2077*/
2078static int journal_read_transaction(struct super_block *sb,
2079                                    unsigned long cur_dblock,
2080                                    unsigned long oldest_start,
2081                                    unsigned int oldest_trans_id,
2082                                    unsigned long newest_mount_id)
2083{
2084        struct reiserfs_journal *journal = SB_JOURNAL(sb);
2085        struct reiserfs_journal_desc *desc;
2086        struct reiserfs_journal_commit *commit;
2087        unsigned int trans_id = 0;
2088        struct buffer_head *c_bh;
2089        struct buffer_head *d_bh;
2090        struct buffer_head **log_blocks = NULL;
2091        struct buffer_head **real_blocks = NULL;
2092        unsigned int trans_offset;
2093        int i;
2094        int trans_half;
2095
2096        d_bh = journal_bread(sb, cur_dblock);
2097        if (!d_bh)
2098                return 1;
2099        desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2100        trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2101        reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
2102                       "journal_read_transaction, offset %llu, len %d mount_id %d",
2103                       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2104                       get_desc_trans_len(desc), get_desc_mount_id(desc));
2105        if (get_desc_trans_id(desc) < oldest_trans_id) {
2106                reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
2107                               "journal_read_trans skipping because %lu is too old",
2108                               cur_dblock -
2109                               SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2110                brelse(d_bh);
2111                return 1;
2112        }
2113        if (get_desc_mount_id(desc) != newest_mount_id) {
2114                reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
2115                               "journal_read_trans skipping because %d is != "
2116                               "newest_mount_id %lu", get_desc_mount_id(desc),
2117                               newest_mount_id);
2118                brelse(d_bh);
2119                return 1;
2120        }
2121        c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2122                             ((trans_offset + get_desc_trans_len(desc) + 1) %
2123                              SB_ONDISK_JOURNAL_SIZE(sb)));
2124        if (!c_bh) {
2125                brelse(d_bh);
2126                return 1;
2127        }
2128        commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2129        if (journal_compare_desc_commit(sb, desc, commit)) {
2130                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2131                               "journal_read_transaction, "
2132                               "commit offset %llu had bad time %d or length %d",
2133                               c_bh->b_blocknr -
2134                               SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2135                               get_commit_trans_id(commit),
2136                               get_commit_trans_len(commit));
2137                brelse(c_bh);
2138                brelse(d_bh);
2139                return 1;
2140        }
2141
2142        if (bdev_read_only(sb->s_bdev)) {
2143                reiserfs_warning(sb, "clm-2076",
2144                                 "device is readonly, unable to replay log");
2145                brelse(c_bh);
2146                brelse(d_bh);
2147                return -EROFS;
2148        }
2149
2150        trans_id = get_desc_trans_id(desc);
2151        /* now we know we've got a good transaction, and it was inside the valid time ranges */
2152        log_blocks = kmalloc(get_desc_trans_len(desc) *
2153                             sizeof(struct buffer_head *), GFP_NOFS);
2154        real_blocks = kmalloc(get_desc_trans_len(desc) *
2155                              sizeof(struct buffer_head *), GFP_NOFS);
2156        if (!log_blocks || !real_blocks) {
2157                brelse(c_bh);
2158                brelse(d_bh);
2159                kfree(log_blocks);
2160                kfree(real_blocks);
2161                reiserfs_warning(sb, "journal-1169",
2162                                 "kmalloc failed, unable to mount FS");
2163                return -1;
2164        }
2165        /* get all the buffer heads */
2166        trans_half = journal_trans_half(sb->s_blocksize);
2167        for (i = 0; i < get_desc_trans_len(desc); i++) {
2168                log_blocks[i] =
2169                    journal_getblk(sb,
2170                                   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2171                                   (trans_offset + 1 +
2172                                    i) % SB_ONDISK_JOURNAL_SIZE(sb));
2173                if (i < trans_half) {
2174                        real_blocks[i] =
2175                            sb_getblk(sb,
2176                                      le32_to_cpu(desc->j_realblock[i]));
2177                } else {
2178                        real_blocks[i] =
2179                            sb_getblk(sb,
2180                                      le32_to_cpu(commit->
2181                                                  j_realblock[i - trans_half]));
2182                }
2183                if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
2184                        reiserfs_warning(sb, "journal-1207",
2185                                         "REPLAY FAILURE fsck required! "
2186                                         "Block to replay is outside of "
2187                                         "filesystem");
2188                        goto abort_replay;
2189                }
2190                /* make sure we don't try to replay onto log or reserved area */
2191                if (is_block_in_log_or_reserved_area
2192                    (sb, real_blocks[i]->b_blocknr)) {
2193                        reiserfs_warning(sb, "journal-1204",
2194                                         "REPLAY FAILURE fsck required! "
2195                                         "Trying to replay onto a log block");
2196                      abort_replay:
2197                        brelse_array(log_blocks, i);
2198                        brelse_array(real_blocks, i);
2199                        brelse(c_bh);
2200                        brelse(d_bh);
2201                        kfree(log_blocks);
2202                        kfree(real_blocks);
2203                        return -1;
2204                }
2205        }
2206        /* read in the log blocks, memcpy to the corresponding real block */
2207        ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2208        for (i = 0; i < get_desc_trans_len(desc); i++) {
2209
2210                reiserfs_write_unlock(sb);
2211                wait_on_buffer(log_blocks[i]);
2212                reiserfs_write_lock(sb);
2213
2214                if (!buffer_uptodate(log_blocks[i])) {
2215                        reiserfs_warning(sb, "journal-1212",
2216                                         "REPLAY FAILURE fsck required! "
2217                                         "buffer write failed");
2218                        brelse_array(log_blocks + i,
2219                                     get_desc_trans_len(desc) - i);
2220                        brelse_array(real_blocks, get_desc_trans_len(desc));
2221                        brelse(c_bh);
2222                        brelse(d_bh);
2223                        kfree(log_blocks);
2224                        kfree(real_blocks);
2225                        return -1;
2226                }
2227                memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
2228                       real_blocks[i]->b_size);
2229                set_buffer_uptodate(real_blocks[i]);
2230                brelse(log_blocks[i]);
2231        }
2232        /* flush out the real blocks */
2233        for (i = 0; i < get_desc_trans_len(desc); i++) {
2234                set_buffer_dirty(real_blocks[i]);
2235                write_dirty_buffer(real_blocks[i], WRITE);
2236        }
2237        for (i = 0; i < get_desc_trans_len(desc); i++) {
2238                wait_on_buffer(real_blocks[i]);
2239                if (!buffer_uptodate(real_blocks[i])) {
2240                        reiserfs_warning(sb, "journal-1226",
2241                                         "REPLAY FAILURE, fsck required! "
2242                                         "buffer write failed");
2243                        brelse_array(real_blocks + i,
2244                                     get_desc_trans_len(desc) - i);
2245                        brelse(c_bh);
2246                        brelse(d_bh);
2247                        kfree(log_blocks);
2248                        kfree(real_blocks);
2249                        return -1;
2250                }
2251                brelse(real_blocks[i]);
2252        }
2253        cur_dblock =
2254            SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2255            ((trans_offset + get_desc_trans_len(desc) +
2256              2) % SB_ONDISK_JOURNAL_SIZE(sb));
2257        reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2258                       "journal-1095: setting journal " "start to offset %ld",
2259                       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2260
2261        /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
2262        journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2263        journal->j_last_flush_trans_id = trans_id;
2264        journal->j_trans_id = trans_id + 1;
2265        /* check for trans_id overflow */
2266        if (journal->j_trans_id == 0)
2267                journal->j_trans_id = 10;
2268        brelse(c_bh);
2269        brelse(d_bh);
2270        kfree(log_blocks);
2271        kfree(real_blocks);
2272        return 0;
2273}
2274
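/* A sketch of where the home block numbers come from during the replay
** above.  Each transaction records the real (home) block number for
** every logged block; the first trans_half entries live in the desc
** block and the rest spill over into the commit block.  With 4k blocks
** (trans_half == JOURNAL_TRANS_HALF == 1018), a transaction of 1020
** blocks resolves as
**
**        i = 0    ... 1017  ->  le32_to_cpu(desc->j_realblock[i])
**        i = 1018 ... 1019  ->  le32_to_cpu(commit->j_realblock[i - 1018])
**
** and log block i is then memcpy'd into the buffer for that home block.
*/
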
2275/* This function reads blocks of bufsize size, starting at block and going up
2276   to max_block (but no more than BUFNR blocks at a time). This proved to
2277   improve mounting speed, at least on self-rebuilding raid5 arrays.
2278   Right now it is only used from journal code. But later we might use it
2279   from other places.
2280   Note: Do not use journal_getblk/sb_getblk functions here! */
2281static struct buffer_head *reiserfs_breada(struct block_device *dev,
2282                                           b_blocknr_t block, int bufsize,
2283                                           b_blocknr_t max_block)
2284{
2285        struct buffer_head *bhlist[BUFNR];
2286        unsigned int blocks = BUFNR;
2287        struct buffer_head *bh;
2288        int i, j;
2289
2290        bh = __getblk(dev, block, bufsize);
2291        if (buffer_uptodate(bh))
2292                return bh;
2293
2294        if (block + BUFNR > max_block) {
2295                blocks = max_block - block;
2296        }
2297        bhlist[0] = bh;
2298        j = 1;
2299        for (i = 1; i < blocks; i++) {
2300                bh = __getblk(dev, block + i, bufsize);
2301                if (buffer_uptodate(bh)) {
2302                        brelse(bh);
2303                        break;
2304                } else
2305                        bhlist[j++] = bh;
2306        }
2307        ll_rw_block(READ, j, bhlist);
2308        for (i = 1; i < j; i++)
2309                brelse(bhlist[i]);
2310        bh = bhlist[0];
2311        wait_on_buffer(bh);
2312        if (buffer_uptodate(bh))
2313                return bh;
2314        brelse(bh);
2315        return NULL;
2316}
2317
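/* Usage sketch, mirroring the call in journal_read() below: scan the
** log sequentially while letting the device see large reads.
**
**        bh = reiserfs_breada(journal->j_dev_bd, cur_dblock,
**                             sb->s_blocksize,
**                             SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
**                             SB_ONDISK_JOURNAL_SIZE(sb));
**
** Asking for one block also queues up to BUFNR - 1 = 63 of the blocks
** that follow it (clamped at max_block, and stopping early at the first
** already-uptodate buffer), so a straight-line scan of the journal
** becomes a few large reads instead of thousands of single-block ones.
*/
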
2318/*
2319** read and replay the log
2320** on a clean unmount, the journal header's next unflushed pointer will point to an invalid
2321** transaction.  This tests that first, before scanning the whole log, which keeps normal mount times fast.
2322**
2323** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid.
2324**
2325** On exit, it sets things up so the first transaction will work correctly.
2326*/
2327static int journal_read(struct super_block *sb)
2328{
2329        struct reiserfs_journal *journal = SB_JOURNAL(sb);
2330        struct reiserfs_journal_desc *desc;
2331        unsigned int oldest_trans_id = 0;
2332        unsigned int oldest_invalid_trans_id = 0;
2333        time_t start;
2334        unsigned long oldest_start = 0;
2335        unsigned long cur_dblock = 0;
2336        unsigned long newest_mount_id = 9;
2337        struct buffer_head *d_bh;
2338        struct reiserfs_journal_header *jh;
2339        int valid_journal_header = 0;
2340        int replay_count = 0;
2341        int continue_replay = 1;
2342        int ret;
2343        char b[BDEVNAME_SIZE];
2344
2345        cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2346        reiserfs_info(sb, "checking transaction log (%s)\n",
2347                      bdevname(journal->j_dev_bd, b));
2348        start = get_seconds();
2349
2350        /* step 1, read in the journal header block.  Check the transaction it says
2351         ** is the first unflushed, and if that transaction is not valid,
2352         ** replay is done
2353         */
2354        journal->j_header_bh = journal_bread(sb,
2355                                             SB_ONDISK_JOURNAL_1st_BLOCK(sb)
2356                                             + SB_ONDISK_JOURNAL_SIZE(sb));
2357        if (!journal->j_header_bh) {
2358                return 1;
2359        }
2360        jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
2361        if (le32_to_cpu(jh->j_first_unflushed_offset) <
2362            SB_ONDISK_JOURNAL_SIZE(sb)
2363            && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2364                oldest_start =
2365                    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2366                    le32_to_cpu(jh->j_first_unflushed_offset);
2367                oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2368                newest_mount_id = le32_to_cpu(jh->j_mount_id);
2369                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2370                               "journal-1153: found in "
2371                               "header: first_unflushed_offset %d, last_flushed_trans_id "
2372                               "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
2373                               le32_to_cpu(jh->j_last_flush_trans_id));
2374                valid_journal_header = 1;
2375
2376                /* now, we try to read the first unflushed offset.  If it is not valid,
2377                 ** there is nothing more we can do, and it makes no sense to read
2378                 ** through the whole log.
2379                 */
2380                d_bh =
2381                    journal_bread(sb,
2382                                  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2383                                  le32_to_cpu(jh->j_first_unflushed_offset));
2384                ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
2385                if (!ret) {
2386                        continue_replay = 0;
2387                }
2388                brelse(d_bh);
2389                goto start_log_replay;
2390        }
2391
2392        /* ok, there are transactions that need to be replayed.  start with the first log block, find
2393         ** all the valid transactions, and pick out the oldest.
2394         */
2395        while (continue_replay
2396               && cur_dblock <
2397               (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2398                SB_ONDISK_JOURNAL_SIZE(sb))) {
2399                /* Note that the blocksize of the primary fs device and the
2400                   journal device must be the same */
2401                d_bh =
2402                    reiserfs_breada(journal->j_dev_bd, cur_dblock,
2403                                    sb->s_blocksize,
2404                                    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2405                                    SB_ONDISK_JOURNAL_SIZE(sb));
2406                ret =
2407                    journal_transaction_is_valid(sb, d_bh,
2408                                                 &oldest_invalid_trans_id,
2409                                                 &newest_mount_id);
2410                if (ret == 1) {
2411                        desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2412                        if (oldest_start == 0) {        /* init all oldest_ values */
2413                                oldest_trans_id = get_desc_trans_id(desc);
2414                                oldest_start = d_bh->b_blocknr;
2415                                newest_mount_id = get_desc_mount_id(desc);
2416                                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2417                                               "journal-1179: Setting "
2418                                               "oldest_start to offset %llu, trans_id %lu",
2419                                               oldest_start -
2420                                               SB_ONDISK_JOURNAL_1st_BLOCK
2421                                               (sb), oldest_trans_id);
2422                        } else if (oldest_trans_id > get_desc_trans_id(desc)) {
2423                                /* one we just read was older */
2424                                oldest_trans_id = get_desc_trans_id(desc);
2425                                oldest_start = d_bh->b_blocknr;
2426                                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2427                                               "journal-1180: Resetting "
2428                                               "oldest_start to offset %lu, trans_id %lu",
2429                                               oldest_start -
2430                                               SB_ONDISK_JOURNAL_1st_BLOCK
2431                                               (sb), oldest_trans_id);
2432                        }
2433                        if (newest_mount_id < get_desc_mount_id(desc)) {
2434                                newest_mount_id = get_desc_mount_id(desc);
2435                                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2436                                               "journal-1299: Setting "
2437                                               "newest_mount_id to %d",
2438                                               get_desc_mount_id(desc));
2439                        }
2440                        cur_dblock += get_desc_trans_len(desc) + 2;
2441                } else {
2442                        cur_dblock++;
2443                }
2444                brelse(d_bh);
2445        }
2446
2447      start_log_replay:
2448        cur_dblock = oldest_start;
2449        if (oldest_trans_id) {
2450                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2451                               "journal-1206: Starting replay "
2452                               "from offset %llu, trans_id %lu",
2453                               cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2454                               oldest_trans_id);
2455
2456        }
2457        replay_count = 0;
2458        while (continue_replay && oldest_trans_id > 0) {
2459                ret =
2460                    journal_read_transaction(sb, cur_dblock, oldest_start,
2461                                             oldest_trans_id, newest_mount_id);
2462                if (ret < 0) {
2463                        return ret;
2464                } else if (ret != 0) {
2465                        break;
2466                }
2467                cur_dblock =
2468                    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
2469                replay_count++;
2470                if (cur_dblock == oldest_start)
2471                        break;
2472        }
2473
2474        if (oldest_trans_id == 0) {
2475                reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2476                               "journal-1225: No valid " "transactions found");
2477        }
2478        /* j_start does not get set correctly if we don't replay any transactions.
2479         ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
2480         ** copy the trans_id from the header
2481         */
2482        if (valid_journal_header && replay_count == 0) {
2483                journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
2484                journal->j_trans_id =
2485                    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2486                /* check for trans_id overflow */
2487                if (journal->j_trans_id == 0)
2488                        journal->j_trans_id = 10;
2489                journal->j_last_flush_trans_id =
2490                    le32_to_cpu(jh->j_last_flush_trans_id);
2491                journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
2492        } else {
2493                journal->j_mount_id = newest_mount_id + 1;
2494        }
2495        reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2496                       "newest_mount_id to %lu", journal->j_mount_id);
2497        journal->j_first_unflushed_offset = journal->j_start;
2498        if (replay_count > 0) {
2499                reiserfs_info(sb,
2500                              "replayed %d transactions in %lu seconds\n",
2501                              replay_count, get_seconds() - start);
2502        }
2503        if (!bdev_read_only(sb->s_bdev) &&
2504            _update_journal_header_block(sb, journal->j_start,
2505                                         journal->j_last_flush_trans_id)) {
2506                /* replay failed, caller must call free_journal_ram and abort
2507                 ** the mount
2508                 */
2509                return -1;
2510        }
2511        return 0;
2512}
2513
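/* End-to-end sketch of a replay after a crash (illustrative values):
** the header says first_unflushed_offset = 4 and last_flush_trans_id =
** 99, so the scan starts at journal block 4 expecting trans_id 100.  If
** that desc/commit pair is valid, journal_read_transaction() copies its
** logged blocks home and bumps j_start past the commit block; the loop
** then continues from the new j_start, replaying successive
** transactions until a stale or invalid one ends the scan.  Finally the
** header is rewritten so the replayed transactions are never replayed
** again.
*/
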
2514static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2515{
2516        struct reiserfs_journal_list *jl;
2517        jl = kzalloc(sizeof(struct reiserfs_journal_list),
2518                     GFP_NOFS | __GFP_NOFAIL);
2519        INIT_LIST_HEAD(&jl->j_list);
2520        INIT_LIST_HEAD(&jl->j_working_list);
2521        INIT_LIST_HEAD(&jl->j_tail_bh_list);
2522        INIT_LIST_HEAD(&jl->j_bh_list);
2523        mutex_init(&jl->j_commit_mutex);
2524        SB_JOURNAL(s)->j_num_lists++;
2525        get_journal_list(jl);
2526        return jl;
2527}
2528
2529static void journal_list_init(struct super_block *sb)
2530{
2531        SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
2532}
2533
2534static int release_journal_dev(struct super_block *super,
2535                               struct reiserfs_journal *journal)
2536{
2537        int result;
2538
2539        result = 0;
2540
2541        if (journal->j_dev_bd != NULL) {
2542                result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2543                journal->j_dev_bd = NULL;
2544        }
2545
2546        if (result != 0) {
2547                reiserfs_warning(super, "sh-457",
2548                                 "Cannot release journal device: %i", result);
2549        }
2550        return result;
2551}
2552
2553static int journal_init_dev(struct super_block *super,
2554                            struct reiserfs_journal *journal,
2555                            const char *jdev_name)
2556{
2557        int result;
2558        dev_t jdev;
2559        fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2560        char b[BDEVNAME_SIZE];
2561
2562        result = 0;
2563
2564        journal->j_dev_bd = NULL;
2565        jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
2566            new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
2567
2568        if (bdev_read_only(super->s_bdev))
2569                blkdev_mode = FMODE_READ;
2570
2571        /* no "jdev" option was given; use the journal device recorded in the super block */
2572        if ((!jdev_name || !jdev_name[0])) {
2573                if (jdev == super->s_dev)
2574                        blkdev_mode &= ~FMODE_EXCL;
2575                journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
2576                                                      journal);
2577                journal->j_dev_mode = blkdev_mode;
2578                if (IS_ERR(journal->j_dev_bd)) {
2579                        result = PTR_ERR(journal->j_dev_bd);
2580                        journal->j_dev_bd = NULL;
2581                        reiserfs_warning(super, "sh-458",
2582                                         "cannot init journal device '%s': %i",
2583                                         __bdevname(jdev, b), result);
2584                        return result;
2585                } else if (jdev != super->s_dev)
2586                        set_blocksize(journal->j_dev_bd, super->s_blocksize);
2587
2588                return 0;
2589        }
2590
2591        journal->j_dev_mode = blkdev_mode;
2592        journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2593        if (IS_ERR(journal->j_dev_bd)) {
2594                result = PTR_ERR(journal->j_dev_bd);
2595                journal->j_dev_bd = NULL;
2596                reiserfs_warning(super,
2597                                 "journal_init_dev: Cannot open '%s': %i",
2598                                 jdev_name, result);
2599                return result;
2600        }
2601
2602        set_blocksize(journal->j_dev_bd, super->s_blocksize);
2603        reiserfs_info(super,
2604                      "journal_init_dev: journal device: %s\n",
2605                      bdevname(journal->j_dev_bd, b));
2606        return 0;
2607}
2608
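/* For reference, the two branches above correspond to the two ways a
** journal device can be specified (example command line, not from this
** file):
**
**        mount -o jdev=/dev/sdb1 /dev/sda1 /mnt
**
** passes jdev_name = "/dev/sdb1" and takes the blkdev_get_by_path()
** branch, while a plain mount resolves the device number recorded in
** the super block (SB_ONDISK_JOURNAL_DEVICE), or falls back to the fs
** device itself, via blkdev_get_by_dev().
*/
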
2609/**
2610 * When creating/tuning a file system user can assign some
2611 * journal params within boundaries which depend on the ratio
2612 * blocksize/standard_blocksize.
2613 *
2614 * For blocks >= standard_blocksize the transaction size should
2615 * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
2616 * than JOURNAL_TRANS_MAX_DEFAULT.
2617 *
2618 * For blocks < standard_blocksize these boundaries should be
2619 * decreased proportionally.
2620 */
2621#define REISERFS_STANDARD_BLKSIZE (4096)
2622
2623static int check_advise_trans_params(struct super_block *sb,
2624                                     struct reiserfs_journal *journal)
2625{
2626        if (journal->j_trans_max) {
2627                /* Non-default journal params.
2628                   Do a sanity check on them. */
2629                int ratio = 1;
2630                if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
2631                        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
2632
2633                if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
2634                    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
2635                    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
2636                    JOURNAL_MIN_RATIO) {
2637                        reiserfs_warning(sb, "sh-462",
2638                                         "bad transaction max size (%u). "
2639                                         "FSCK?", journal->j_trans_max);
2640                        return 1;
2641                }
2642                if (journal->j_max_batch != (journal->j_trans_max) *
2643                        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
2644                        reiserfs_warning(sb, "sh-463",
2645                                         "bad transaction max batch (%u). "
2646                                         "FSCK?", journal->j_max_batch);
2647                        return 1;
2648                }
2649        } else {
2650                /* Default journal params.
2651                   The file system was created by an old version
2652                   of mkreiserfs, so some fields contain zeros,
2653                   and we need to advise proper values for them */
2654                if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
2655                        reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
2656                                         sb->s_blocksize);
2657                        return 1;
2658                }
2659                journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
2660                journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
2661                journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
2662        }
2663        return 0;
2664}
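
/*
 * Illustrative sketch (not compiled): how the boundaries above scale.
 * For a 1K blocksize with the standard 4K blocksize, ratio = 4096/1024
 * = 4, so j_trans_max must fall within
 * [JOURNAL_TRANS_MIN_DEFAULT/4, JOURNAL_TRANS_MAX_DEFAULT/4], and the
 * on-disk journal must hold at least JOURNAL_MIN_RATIO transactions.
 */
#if 0
static int trans_max_in_bounds(struct super_block *sb, unsigned int trans_max)
{
        int ratio = 1;

        if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
                ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;

        return trans_max <= JOURNAL_TRANS_MAX_DEFAULT / ratio &&
               trans_max >= JOURNAL_TRANS_MIN_DEFAULT / ratio &&
               SB_ONDISK_JOURNAL_SIZE(sb) / trans_max >= JOURNAL_MIN_RATIO;
}
#endif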
2665
2666/*
2667** must be called once on fs mount.  calls journal_read for you
2668*/
2669int journal_init(struct super_block *sb, const char *j_dev_name,
2670                 int old_format, unsigned int commit_max_age)
2671{
2672        int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
2673        struct buffer_head *bhjh;
2674        struct reiserfs_super_block *rs;
2675        struct reiserfs_journal_header *jh;
2676        struct reiserfs_journal *journal;
2677        struct reiserfs_journal_list *jl;
2678        char b[BDEVNAME_SIZE];
2679        int ret;
2680
2681        journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
2682        if (!journal) {
2683                reiserfs_warning(sb, "journal-1256",
2684                                 "unable to get memory for journal structure");
2685                return 1;
2686        }
2687        INIT_LIST_HEAD(&journal->j_bitmap_nodes);
2688        INIT_LIST_HEAD(&journal->j_prealloc_list);
2689        INIT_LIST_HEAD(&journal->j_working_list);
2690        INIT_LIST_HEAD(&journal->j_journal_list);
2691        journal->j_persistent_trans = 0;
2692        if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2693                                           reiserfs_bmap_count(sb)))
2694                goto free_and_return;
2695
2696        allocate_bitmap_nodes(sb);
2697
2698        /* reserved for journal area support */
2699        SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
2700                                                 REISERFS_OLD_DISK_OFFSET_IN_BYTES
2701                                                 / sb->s_blocksize +
2702                                                 reiserfs_bmap_count(sb) +
2703                                                 1 :
2704                                                 REISERFS_DISK_OFFSET_IN_BYTES /
2705                                                 sb->s_blocksize + 2);
2706
2707        /* Sanity check to see if the standard journal fits within the
2708           first bitmap block (relevant for small blocksizes) */
2709        if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2710            (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
2711             SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
2712                reiserfs_warning(sb, "journal-1393",
2713                                 "journal does not fit for area addressed "
2714                                 "by first of bitmap blocks. It starts at "
2715                                 "%u and its size is %u. Block size %ld",
2716                                 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
2717                                 SB_ONDISK_JOURNAL_SIZE(sb),
2718                                 sb->s_blocksize);
2719                goto free_and_return;
2720        }
2721
2722        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2723                reiserfs_warning(sb, "sh-462",
2724                                 "unable to initialize jornal device");
2725                goto free_and_return;
2726        }
2727
2728        rs = SB_DISK_SUPER_BLOCK(sb);
2729
2730        /* read journal header */
2731        bhjh = journal_bread(sb,
2732                             SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2733                             SB_ONDISK_JOURNAL_SIZE(sb));
2734        if (!bhjh) {
2735                reiserfs_warning(sb, "sh-459",
2736                                 "unable to read journal header");
2737                goto free_and_return;
2738        }
2739        jh = (struct reiserfs_journal_header *)(bhjh->b_data);
2740
2741        /* make sure that the journal matches the super block */
2742        if (is_reiserfs_jr(rs)
2743            && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
2744                sb_jp_journal_magic(rs))) {
2745                reiserfs_warning(sb, "sh-460",
2746                                 "journal header magic %x (device %s) does "
2747                                 "not match to magic found in super block %x",
2748                                 jh->jh_journal.jp_journal_magic,
2749                                 bdevname(journal->j_dev_bd, b),
2750                                 sb_jp_journal_magic(rs));
2751                brelse(bhjh);
2752                goto free_and_return;
2753        }
2754
2755        journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
2756        journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
2757        journal->j_max_commit_age =
2758            le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
2759        journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
2760
2761        if (check_advise_trans_params(sb, journal) != 0)
2762                goto free_and_return;
2763        journal->j_default_max_commit_age = journal->j_max_commit_age;
2764
2765        if (commit_max_age != 0) {
2766                journal->j_max_commit_age = commit_max_age;
2767                journal->j_max_trans_age = commit_max_age;
2768        }
2769
2770        reiserfs_info(sb, "journal params: device %s, size %u, "
2771                      "journal first block %u, max trans len %u, max batch %u, "
2772                      "max commit age %u, max trans age %u\n",
2773                      bdevname(journal->j_dev_bd, b),
2774                      SB_ONDISK_JOURNAL_SIZE(sb),
2775                      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2776                      journal->j_trans_max,
2777                      journal->j_max_batch,
2778                      journal->j_max_commit_age, journal->j_max_trans_age);
2779
2780        brelse(bhjh);
2781
2782        journal->j_list_bitmap_index = 0;
2783        journal_list_init(sb);
2784
2785        memset(journal->j_list_hash_table, 0,
2786               JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
2787
2788        INIT_LIST_HEAD(&journal->j_dirty_buffers);
2789        spin_lock_init(&journal->j_dirty_buffers_lock);
2790
2791        journal->j_start = 0;
2792        journal->j_len = 0;
2793        journal->j_len_alloc = 0;
2794        atomic_set(&(journal->j_wcount), 0);
2795        atomic_set(&(journal->j_async_throttle), 0);
2796        journal->j_bcount = 0;
2797        journal->j_trans_start_time = 0;
2798        journal->j_last = NULL;
2799        journal->j_first = NULL;
2800        init_waitqueue_head(&(journal->j_join_wait));
2801        mutex_init(&journal->j_mutex);
2802        mutex_init(&journal->j_flush_mutex);
2803
2804        journal->j_trans_id = 10;
2805        journal->j_mount_id = 10;
2806        journal->j_state = 0;
2807        atomic_set(&(journal->j_jlock), 0);
2808        journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2809        journal->j_cnode_free_orig = journal->j_cnode_free_list;
2810        journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2811        journal->j_cnode_used = 0;
2812        journal->j_must_wait = 0;
2813
2814        if (journal->j_cnode_free == 0) {
2815                reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
2816                                 "allocation failed (%ld bytes). Journal is "
2817                                 "too large for available memory. Usually "
2818                                 "this is due to a journal that is too large.",
2819                                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
2820                goto free_and_return;
2821        }
2822
2823        init_journal_hash(sb);
2824        jl = journal->j_current_jl;
2825
2826        /*
2827         * get_list_bitmap() may call flush_commit_list() which
2828         * requires the lock. Calling flush_commit_list() shouldn't happen
2829         * this early but I like to be paranoid.
2830         */
2831        reiserfs_write_lock(sb);
2832        jl->j_list_bitmap = get_list_bitmap(sb, jl);
2833        reiserfs_write_unlock(sb);
2834        if (!jl->j_list_bitmap) {
2835                reiserfs_warning(sb, "journal-2005",
2836                                 "get_list_bitmap failed for journal list 0");
2837                goto free_and_return;
2838        }
2839
2840        /*
2841         * Journal_read needs to be inspected in order to push down
2842         * the lock further inside (or even remove it).
2843         */
2844        reiserfs_write_lock(sb);
2845        ret = journal_read(sb);
2846        reiserfs_write_unlock(sb);
2847        if (ret < 0) {
2848                reiserfs_warning(sb, "reiserfs-2006",
2849                                 "Replay Failure, unable to mount");
2850                goto free_and_return;
2851        }
2852
2853        reiserfs_mounted_fs_count++;
2854        if (reiserfs_mounted_fs_count <= 1)
2855                commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2856
2857        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2858        journal->j_work_sb = sb;
2859        return 0;
2860      free_and_return:
2861        free_journal_ram(sb);
2862        return 1;
2863}
2864
2865/*
2866** test for a polite end of the current transaction.  Used by file_write, and should
2867** be used by delete to make sure they don't write more than can fit inside a single
2868** transaction
2869*/
2870int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2871                                   int new_alloc)
2872{
2873        struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2874        time_t now = get_seconds();
2875        /* cannot restart while nested */
2876        BUG_ON(!th->t_trans_id);
2877        if (th->t_refcount > 1)
2878                return 0;
2879        if (journal->j_must_wait > 0 ||
2880            (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
2881            atomic_read(&(journal->j_jlock)) ||
2882            (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
2883            journal->j_cnode_free < (journal->j_trans_max * 3)) {
2884                return 1;
2885        }
2886
2887        journal->j_len_alloc += new_alloc;
2888        th->t_blocks_allocated += new_alloc;
2889        return 0;
2890}
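
/*
 * A minimal caller sketch (not compiled; the loop shape and the commented
 * do_one_step() are assumed, not taken from this file): long operations
 * poll journal_transaction_should_end() and politely restart the
 * transaction when the journal asks for it.
 */
#if 0
static int do_steps(struct reiserfs_transaction_handle *th,
                    struct super_block *sb, int nsteps, int blocks_per_step)
{
        int err = 0;

        while (nsteps-- > 0) {
                if (journal_transaction_should_end(th, blocks_per_step)) {
                        /* end this transaction and begin a fresh one */
                        err = journal_end(th, sb, blocks_per_step);
                        if (!err)
                                err = journal_begin(th, sb, blocks_per_step);
                        if (err)
                                break;
                }
                /* do_one_step(th);  -- one unit of work under the handle */
        }
        return err;
}
#endif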
2891
2892/* this must be called inside a transaction
2893*/
2894void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2895{
2896        struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
2897        BUG_ON(!th->t_trans_id);
2898        journal->j_must_wait = 1;
2899        set_bit(J_WRITERS_BLOCKED, &journal->j_state);
2900        return;
2901}
2902
2903/* this must be called without a transaction started
2904*/
2905void reiserfs_allow_writes(struct super_block *s)
2906{
2907        struct reiserfs_journal *journal = SB_JOURNAL(s);
2908        clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
2909        wake_up(&journal->j_join_wait);
2910}
2911
2912/* this must be called without a transaction started
2913*/
2914void reiserfs_wait_on_write_block(struct super_block *s)
2915{
2916        struct reiserfs_journal *journal = SB_JOURNAL(s);
2917        wait_event(journal->j_join_wait,
2918                   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
2919}
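
/*
 * Pairing sketch (not compiled; do_offline_work() is a hypothetical
 * placeholder): reiserfs_block_writes() is called inside a transaction to
 * raise j_must_wait and set J_WRITERS_BLOCKED; once the work is done,
 * reiserfs_allow_writes() clears the bit and wakes anyone parked in
 * reiserfs_wait_on_write_block().
 */
#if 0
        /* inside a transaction handle th: */
        reiserfs_block_writes(th);
        /* ... end the transaction, then do the work writers must not see ... */
        do_offline_work(s);
        reiserfs_allow_writes(s);

        /* any task wanting to write first does: */
        reiserfs_wait_on_write_block(s);
#endif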
2920
2921static void queue_log_writer(struct super_block *s)
2922{
2923        wait_queue_t wait;
2924        struct reiserfs_journal *journal = SB_JOURNAL(s);
2925        set_bit(J_WRITERS_QUEUED, &journal->j_state);
2926
2927        /*
2928         * we don't want to use wait_event here because
2929         * we only want to wait once.
2930         */
2931        init_waitqueue_entry(&wait, current);
2932        add_wait_queue(&journal->j_join_wait, &wait);
2933        set_current_state(TASK_UNINTERRUPTIBLE);
2934        if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
2935                reiserfs_write_unlock(s);
2936                schedule();
2937                reiserfs_write_lock(s);
2938        }
2939        __set_current_state(TASK_RUNNING);
2940        remove_wait_queue(&journal->j_join_wait, &wait);
2941}
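
/*
 * The open-coded wait above is the "sleep at most once" idiom: wait_event()
 * would loop until the condition holds, but here a single wakeup is enough.
 * A standalone sketch of the same pattern using the prepare_to_wait()
 * helpers (`wq' and `condition' are placeholders):
 */
#if 0
        DEFINE_WAIT(wait);

        prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!condition)         /* re-check after queueing ourselves */
                schedule();     /* sleep once; no retry loop */
        finish_wait(&wq, &wait);
#endif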
2942
2943static void wake_queued_writers(struct super_block *s)
2944{
2945        struct reiserfs_journal *journal = SB_JOURNAL(s);
2946        if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
2947                wake_up(&journal->j_join_wait);
2948}
2949
2950static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2951{
2952        struct reiserfs_journal *journal = SB_JOURNAL(sb);
2953        unsigned long bcount = journal->j_bcount;
2954        while (1) {
2955                reiserfs_write_unlock(sb);
2956                schedule_timeout_uninterruptible(1);
2957                reiserfs_write_lock(sb);
2958                journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2959                while ((atomic_read(&journal->j_wcount) > 0 ||
2960                        atomic_read(&journal->j_jlock)) &&
2961                       journal->j_trans_id == trans_id) {
2962                        queue_log_writer(sb);
2963                }
2964                if (journal->j_trans_id != trans_id)
2965                        break;
2966                if (bcount == journal->j_bcount)
2967                        break;
2968                bcount = journal->j_bcount;
2969        }
2970}
2971
2972/* join == true if you must join an existing transaction.
2973** join == false if you can deal with waiting for others to finish
2974**
2975** this will block until the transaction is joinable.  send the number of blocks you
2976** expect to use in nblocks.
2977*/
2978static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
2979                              struct super_block *sb, unsigned long nblocks,
2980                              int join)
2981{
2982        time_t now = get_seconds();
2983        unsigned int old_trans_id;
2984        struct reiserfs_journal *journal = SB_JOURNAL(sb);
2985        struct reiserfs_transaction_handle myth;
2986        int sched_count = 0;
2987        int retval;
2988
2989        reiserfs_check_lock_depth(sb, "journal_begin");
2990        BUG_ON(nblocks > journal->j_trans_max);
2991
2992        PROC_INFO_INC(sb, journal.journal_being);
2993        /* set here for journal_join */
2994        th->t_refcount = 1;
2995        th->t_super = sb;
2996
2997      relock:
2998        lock_journal(sb);
2999        if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
3000                unlock_journal(sb);
3001                retval = journal->j_errno;
3002                goto out_fail;
3003        }
3004        journal->j_bcount++;
3005
3006        if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3007                unlock_journal(sb);
3008                reiserfs_write_unlock(sb);
3009                reiserfs_wait_on_write_block(sb);
3010                reiserfs_write_lock(sb);
3011                PROC_INFO_INC(sb, journal.journal_relock_writers);
3012                goto relock;
3013        }
3014        now = get_seconds();
3015
3016        /* if there is no room in the journal OR if this transaction is
3017         ** too old and we weren't called joinable, wait for it to finish
3018         ** before beginning.  We don't sleep if there aren't other writers.
3019         */
3020
3021        if ((!join && journal->j_must_wait > 0) ||
3022            (!join
3023             && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
3024            || (!join && atomic_read(&journal->j_wcount) > 0
3025                && journal->j_trans_start_time > 0
3026                && (now - journal->j_trans_start_time) >
3027                journal->j_max_trans_age) || (!join
3028                                              && atomic_read(&journal->j_jlock))
3029            || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
3030
3031                old_trans_id = journal->j_trans_id;
3032                unlock_journal(sb);     /* allow others to finish this transaction */
3033
3034                if (!join && (journal->j_len_alloc + nblocks + 2) >=
3035                    journal->j_max_batch &&
3036                    ((journal->j_len + nblocks + 2) * 100) <
3037                    (journal->j_len_alloc * 75)) {
3038                        if (atomic_read(&journal->j_wcount) > 10) {
3039                                sched_count++;
3040                                queue_log_writer(sb);
3041                                goto relock;
3042                        }
3043                }
3044                /* don't mess with joining the transaction if all we have to do is
3045                 * wait for someone else to do a commit
3046                 */
3047                if (atomic_read(&journal->j_jlock)) {
3048                        while (journal->j_trans_id == old_trans_id &&
3049                               atomic_read(&journal->j_jlock)) {
3050                                queue_log_writer(sb);
3051                        }
3052                        goto relock;
3053                }
3054                retval = journal_join(&myth, sb, 1);
3055                if (retval)
3056                        goto out_fail;
3057
3058                /* someone might have ended the transaction while we joined */
3059                if (old_trans_id != journal->j_trans_id) {
3060                        retval = do_journal_end(&myth, sb, 1, 0);
3061                } else {
3062                        retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
3063                }
3064
3065                if (retval)
3066                        goto out_fail;
3067
3068                PROC_INFO_INC(sb, journal.journal_relock_wcount);
3069                goto relock;
3070        }
3071        /* we are the first writer, set trans_id */
3072        if (journal->j_trans_start_time == 0) {
3073                journal->j_trans_start_time = get_seconds();
3074        }
3075        atomic_inc(&(journal->j_wcount));
3076        journal->j_len_alloc += nblocks;
3077        th->t_blocks_logged = 0;
3078        th->t_blocks_allocated = nblocks;
3079        th->t_trans_id = journal->j_trans_id;
3080        unlock_journal(sb);
3081        INIT_LIST_HEAD(&th->t_list);
3082        return 0;
3083
3084      out_fail:
3085        memset(th, 0, sizeof(*th));
3086        /* Re-set th->t_super, so we can properly keep track of how many
3087         * persistent transactions there are. We need to do this so if this
3088         * call is part of a failed restart_transaction, we can free it later */
3089        th->t_super = sb;
3090        return retval;
3091}
3092
3093struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
3094                                                                    super_block
3095                                                                    *s,
3096                                                                    int nblocks)
3097{
3098        int ret;
3099        struct reiserfs_transaction_handle *th;
3100
3101        /* if we're nesting into an existing transaction, it will be
3102         ** persistent on its own
3103         */
3104        if (reiserfs_transaction_running(s)) {
3105                th = current->journal_info;
3106                th->t_refcount++;
3107                BUG_ON(th->t_refcount < 2);
3108
3109                return th;
3110        }
3111        th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
3112        if (!th)
3113                return NULL;
3114        ret = journal_begin(th, s, nblocks);
3115        if (ret) {
3116                kfree(th);
3117                return NULL;
3118        }
3119
3120        SB_JOURNAL(s)->j_persistent_trans++;
3121        return th;
3122}
3123
3124int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
3125{
3126        struct super_block *s = th->t_super;
3127        int ret = 0;
3128        if (th->t_trans_id)
3129                ret = journal_end(th, th->t_super, th->t_blocks_allocated);
3130        else
3131                ret = -EIO;
3132        if (th->t_refcount == 0) {
3133                SB_JOURNAL(s)->j_persistent_trans--;
3134                kfree(th);
3135        }
3136        return ret;
3137}
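
/*
 * Usage sketch (not compiled): pairing reiserfs_persistent_transaction()
 * with reiserfs_end_persistent_transaction().  If a transaction is already
 * running, the existing handle is returned with its refcount bumped, and
 * the final "end" only frees a handle this pair allocated.
 */
#if 0
        struct reiserfs_transaction_handle *th;
        int err;

        th = reiserfs_persistent_transaction(s, 10);
        if (!th)
                return -ENOMEM;
        /* ... prepare and journal_mark_dirty() the buffers we change ... */
        err = reiserfs_end_persistent_transaction(th);
#endif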
3138
3139static int journal_join(struct reiserfs_transaction_handle *th,
3140                        struct super_block *sb, unsigned long nblocks)
3141{
3142        struct reiserfs_transaction_handle *cur_th = current->journal_info;
3143
3144        /* this keeps do_journal_end from NULLing out the current->journal_info
3145         ** pointer
3146         */
3147        th->t_handle_save = cur_th;
3148        BUG_ON(cur_th && cur_th->t_refcount > 1);
3149        return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
3150}
3151
3152int journal_join_abort(struct reiserfs_transaction_handle *th,
3153                       struct super_block *sb, unsigned long nblocks)
3154{
3155        struct reiserfs_transaction_handle *cur_th = current->journal_info;
3156
3157        /* this keeps do_journal_end from NULLing out the current->journal_info
3158         ** pointer
3159         */
3160        th->t_handle_save = cur_th;
3161        BUG_ON(cur_th && cur_th->t_refcount > 1);
3162        return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
3163}
3164
3165int journal_begin(struct reiserfs_transaction_handle *th,
3166                  struct super_block *sb, unsigned long nblocks)
3167{
3168        struct reiserfs_transaction_handle *cur_th = current->journal_info;
3169        int ret;
3170
3171        th->t_handle_save = NULL;
3172        if (cur_th) {
3173                /* we are nesting into the current transaction */
3174                if (cur_th->t_super == sb) {
3175                        BUG_ON(!cur_th->t_refcount);
3176                        cur_th->t_refcount++;
3177                        memcpy(th, cur_th, sizeof(*th));
3178                        if (th->t_refcount <= 1)
3179                                reiserfs_warning(sb, "reiserfs-2005",
3180                                                 "BAD: refcount <= 1, but "
3181                                                 "journal_info != 0");
3182                        return 0;
3183                } else {
3184                        /* we've ended up with a handle from a different filesystem.
3185                         ** save it and restore on journal_end.  This should never
3186                         ** really happen...
3187                         */
3188                        reiserfs_warning(sb, "clm-2100",
3189                                         "nesting info a different FS");
3190                        th->t_handle_save = current->journal_info;
3191                        current->journal_info = th;
3192                }
3193        } else {
3194                current->journal_info = th;
3195        }
3196        ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
3197        BUG_ON(current->journal_info != th);
3198
3199        /* I guess this boils down to being the reciprocal of clm-2100 above.
3200         * If do_journal_begin_r fails, we need to put it back, since journal_end
3201         * won't be called to do it. */
3202        if (ret)
3203                current->journal_info = th->t_handle_save;
3204        else
3205                BUG_ON(!th->t_refcount);
3206
3207        return ret;
3208}
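
/*
 * Nesting sketch (not compiled): a second journal_begin() on the same
 * super block does not start a new transaction.  It copies the current
 * handle and bumps t_refcount; the matching journal_end() just drops the
 * refcount until the outermost end really runs do_journal_end().
 */
#if 0
        struct reiserfs_transaction_handle outer, inner;
        int err;

        err = journal_begin(&outer, sb, 10);    /* starts or joins a trans */
        err = journal_begin(&inner, sb, 1);     /* nests: t_refcount == 2 */
        /* ... journal_mark_dirty() buffers under either handle ... */
        err = journal_end(&inner, sb, 1);       /* only drops the refcount */
        err = journal_end(&outer, sb, 10);      /* may actually end it */
#endif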
3209
3210/*
3211** puts bh into the current transaction.  If it was already there, it removes the
3212** old pointers from the hash and puts new ones in (to make sure replay happens in the right order).
3213**
3214** if it was dirty, cleans and files onto the clean list.  I can't let it be dirty again until the
3215** transaction is committed.
3216**
3217** if j_len is bigger than j_len_alloc, it pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT.
3218*/
3219int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3220                       struct super_block *sb, struct buffer_head *bh)
3221{
3222        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3223        struct reiserfs_journal_cnode *cn = NULL;
3224        int count_already_incd = 0;
3225        int prepared = 0;
3226        BUG_ON(!th->t_trans_id);
3227
3228        PROC_INFO_INC(sb, journal.mark_dirty);
3229        if (th->t_trans_id != journal->j_trans_id) {
3230                reiserfs_panic(th->t_super, "journal-1577",
3231                               "handle trans id %ld != current trans id %ld",
3232                               th->t_trans_id, journal->j_trans_id);
3233        }
3234
3235        sb->s_dirt = 1;
3236
3237        prepared = test_clear_buffer_journal_prepared(bh);
3238        clear_buffer_journal_restore_dirty(bh);
3239        /* already in this transaction, we are done */
3240        if (buffer_journaled(bh)) {
3241                PROC_INFO_INC(sb, journal.mark_dirty_already);
3242                return 0;
3243        }
3244
3245        /* this must be turned into a panic instead of a warning.  We can't allow
3246         ** a dirty or journal_dirty or locked buffer to be logged, as some changes
3247         ** could get to disk too early.  NOT GOOD.
3248         */
3249        if (!prepared || buffer_dirty(bh)) {
3250                reiserfs_warning(sb, "journal-1777",
3251                                 "buffer %llu bad state "
3252                                 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
3253                                 (unsigned long long)bh->b_blocknr,
3254                                 prepared ? ' ' : '!',
3255                                 buffer_locked(bh) ? ' ' : '!',
3256                                 buffer_dirty(bh) ? ' ' : '!',
3257                                 buffer_journal_dirty(bh) ? ' ' : '!');
3258        }
3259
3260        if (atomic_read(&(journal->j_wcount)) <= 0) {
3261                reiserfs_warning(sb, "journal-1409",
3262                                 "returning because j_wcount was %d",
3263                                 atomic_read(&(journal->j_wcount)));
3264                return 1;
3265        }
3266        /* this error means I've screwed up, and we've overflowed the transaction.
3267         ** Nothing can be done here, except make the FS readonly or panic.
3268         */
3269        if (journal->j_len >= journal->j_trans_max) {
3270                reiserfs_panic(th->t_super, "journal-1413",
3271                               "j_len (%lu) is too big",
3272                               journal->j_len);
3273        }
3274
3275        if (buffer_journal_dirty(bh)) {
3276                count_already_incd = 1;
3277                PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
3278                clear_buffer_journal_dirty(bh);
3279        }
3280
3281        if (journal->j_len > journal->j_len_alloc) {
3282                journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
3283        }
3284
3285        set_buffer_journaled(bh);
3286
3287        /* now put this guy on the end */
3288        if (!cn) {
3289                cn = get_cnode(sb);
3290                if (!cn) {
3291                        reiserfs_panic(sb, "journal-4", "get_cnode failed!");
3292                }
3293
3294                if (th->t_blocks_logged == th->t_blocks_allocated) {
3295                        th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
3296                        journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
3297                }
3298                th->t_blocks_logged++;
3299                journal->j_len++;
3300
3301                cn->bh = bh;
3302                cn->blocknr = bh->b_blocknr;
3303                cn->sb = sb;
3304                cn->jlist = NULL;
3305                insert_journal_hash(journal->j_hash_table, cn);
3306                if (!count_already_incd) {
3307                        get_bh(bh);
3308                }
3309        }
3310        cn->next = NULL;
3311        cn->prev = journal->j_last;
3312        cn->bh = bh;
3313        if (journal->j_last) {
3314                journal->j_last->next = cn;
3315                journal->j_last = cn;
3316        } else {
3317                journal->j_first = cn;
3318                journal->j_last = cn;
3319        }
3320        return 0;
3321}
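
/*
 * Usage sketch, mirroring what this file does with the super block's own
 * buffer: a block must be prepared before it is modified and logged.
 */
#if 0
        reiserfs_prepare_for_journal(sb, bh, 1); /* clean it, wait on it */
        /* ... modify bh->b_data while it is journal_prepared ... */
        journal_mark_dirty(th, sb, bh);          /* log it in this trans */
#endif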
3322
3323int journal_end(struct reiserfs_transaction_handle *th,
3324                struct super_block *sb, unsigned long nblocks)
3325{
3326        if (!current->journal_info && th->t_refcount > 1)
3327                reiserfs_warning(sb, "REISER-NESTING",
3328                                 "th NULL, refcount %d", th->t_refcount);
3329
3330        if (!th->t_trans_id) {
3331                WARN_ON(1);
3332                return -EIO;
3333        }
3334
3335        th->t_refcount--;
3336        if (th->t_refcount > 0) {
3337                struct reiserfs_transaction_handle *cur_th =
3338                    current->journal_info;
3339
3340                /* we aren't allowed to close a nested transaction on a different
3341                 ** filesystem from the one in the task struct
3342                 */
3343                BUG_ON(cur_th->t_super != th->t_super);
3344
3345                if (th != cur_th) {
3346                        memcpy(current->journal_info, th, sizeof(*th));
3347                        th->t_trans_id = 0;
3348                }
3349                return 0;
3350        } else {
3351                return do_journal_end(th, sb, nblocks, 0);
3352        }
3353}
3354
3355/* removes a block from the current transaction, brelse-ing and decrementing any counters.
3356** also files the removed buffer directly onto the clean list
3357**
3358** called by journal_mark_freed when a block has been deleted
3359**
3360** returns 1 if it cleaned and brelse'd the buffer, 0 otherwise
3361*/
3362static int remove_from_transaction(struct super_block *sb,
3363                                   b_blocknr_t blocknr, int already_cleaned)
3364{
3365        struct buffer_head *bh;
3366        struct reiserfs_journal_cnode *cn;
3367        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3368        int ret = 0;
3369
3370        cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3371        if (!cn || !cn->bh) {
3372                return ret;
3373        }
3374        bh = cn->bh;
3375        if (cn->prev) {
3376                cn->prev->next = cn->next;
3377        }
3378        if (cn->next) {
3379                cn->next->prev = cn->prev;
3380        }
3381        if (cn == journal->j_first) {
3382                journal->j_first = cn->next;
3383        }
3384        if (cn == journal->j_last) {
3385                journal->j_last = cn->prev;
3386        }
3387        if (bh)
3388                remove_journal_hash(sb, journal->j_hash_table, NULL,
3389                                    bh->b_blocknr, 0);
3390        clear_buffer_journaled(bh);     /* don't log this one */
3391
3392        if (!already_cleaned) {
3393                clear_buffer_journal_dirty(bh);
3394                clear_buffer_dirty(bh);
3395                clear_buffer_journal_test(bh);
3396                put_bh(bh);
3397                if (atomic_read(&(bh->b_count)) < 0) {
3398                        reiserfs_warning(sb, "journal-1752",
3399                                         "b_count < 0");
3400                }
3401                ret = 1;
3402        }
3403        journal->j_len--;
3404        journal->j_len_alloc--;
3405        free_cnode(sb, cn);
3406        return ret;
3407}
3408
3409/*
3410** a cnode in a journal list can only be dirtied if all the
3411** transactions that include it have been committed to disk.
3412** this checks through each transaction, and returns 1 if you are allowed to dirty,
3413** and 0 if you aren't
3414**
3415** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
3416** blocks for a given transaction on disk
3417**
3418*/
3419static int can_dirty(struct reiserfs_journal_cnode *cn)
3420{
3421        struct super_block *sb = cn->sb;
3422        b_blocknr_t blocknr = cn->blocknr;
3423        struct reiserfs_journal_cnode *cur = cn->hprev;
3424        int can_dirty = 1;
3425
3426        /* first test hprev.  These are all newer than cn, so any node here
3427         ** with the same block number and dev means this node can't be sent
3428         ** to disk right now.
3429         */
3430        while (cur && can_dirty) {
3431                if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
3432                    cur->blocknr == blocknr) {
3433                        can_dirty = 0;
3434                }
3435                cur = cur->hprev;
3436        }
3437        /* then test hnext.  These are all older than cn.  As long as they
3438         ** are committed to the log, it is safe to write cn to disk
3439         */
3440        cur = cn->hnext;
3441        while (cur && can_dirty) {
3442                if (cur->jlist && cur->jlist->j_len > 0 &&
3443                    atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
3444                    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
3445                        can_dirty = 0;
3446                }
3447                cur = cur->hnext;
3448        }
3449        return can_dirty;
3450}
3451
3452/* syncs the commit blocks, but does not force the real buffers to disk.
3453** will wait until the current transaction is done/committed before returning
3454*/
3455int journal_end_sync(struct reiserfs_transaction_handle *th,
3456                     struct super_block *sb, unsigned long nblocks)
3457{
3458        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3459
3460        BUG_ON(!th->t_trans_id);
3461        /* syncing while nested would be very, very bad */
3462        BUG_ON(th->t_refcount > 1);
3463        if (journal->j_len == 0) {
3464                reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3465                                             1);
3466                journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3467        }
3468        return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
3469}
3470
3471/*
3472** writeback the pending async commits to disk
3473*/
3474static void flush_async_commits(struct work_struct *work)
3475{
3476        struct reiserfs_journal *journal =
3477                container_of(work, struct reiserfs_journal, j_work.work);
3478        struct super_block *sb = journal->j_work_sb;
3479        struct reiserfs_journal_list *jl;
3480        struct list_head *entry;
3481
3482        reiserfs_write_lock(sb);
3483        if (!list_empty(&journal->j_journal_list)) {
3484                /* last entry is the youngest, commit it and you get everything */
3485                entry = journal->j_journal_list.prev;
3486                jl = JOURNAL_LIST_ENTRY(entry);
3487                flush_commit_list(sb, jl, 1);
3488        }
3489        reiserfs_write_unlock(sb);
3490}
3491
3492/*
3493** flushes any old transactions to disk
3494** ends the current transaction if it is too old
3495*/
3496int reiserfs_flush_old_commits(struct super_block *sb)
3497{
3498        time_t now;
3499        struct reiserfs_transaction_handle th;
3500        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3501
3502        now = get_seconds();
3503        /* safety check so we don't flush while we are replaying the log during
3504         * mount
3505         */
3506        if (list_empty(&journal->j_journal_list)) {
3507                return 0;
3508        }
3509
3510        /* check the current transaction.  If there are no writers, and it is
3511         * too old, finish it, and force the commit blocks to disk
3512         */
3513        if (atomic_read(&journal->j_wcount) <= 0 &&
3514            journal->j_trans_start_time > 0 &&
3515            journal->j_len > 0 &&
3516            (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3517                if (!journal_join(&th, sb, 1)) {
3518                        reiserfs_prepare_for_journal(sb,
3519                                                     SB_BUFFER_WITH_SB(sb),
3520                                                     1);
3521                        journal_mark_dirty(&th, sb,
3522                                           SB_BUFFER_WITH_SB(sb));
3523
3524                        /* we're only being called from kreiserfsd; it makes no sense to do
3525                         ** an async commit just so kreiserfsd can do it later
3526                         */
3527                        do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
3528                }
3529        }
3530        return sb->s_dirt;
3531}
3532
3533/*
3534** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
3535**
3536** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
3537** the writers are done.  By the time it wakes up, the transaction it was called with has already ended, so it just
3538** flushes the commit list and returns 0.
3539**
3540** Won't batch when flush or commit_now is set.  Also won't batch when others are waiting on j_join_wait.
3541**
3542** Note, we can't allow the journal_end to proceed while there are still writers in the log.
3543*/
3544static int check_journal_end(struct reiserfs_transaction_handle *th,
3545                             struct super_block *sb, unsigned long nblocks,
3546                             int flags)
3547{
3548
3549        time_t now;
3550        int flush = flags & FLUSH_ALL;
3551        int commit_now = flags & COMMIT_NOW;
3552        int wait_on_commit = flags & WAIT;
3553        struct reiserfs_journal_list *jl;
3554        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3555
3556        BUG_ON(!th->t_trans_id);
3557
3558        if (th->t_trans_id != journal->j_trans_id) {
3559                reiserfs_panic(th->t_super, "journal-1577",
3560                               "handle trans id %ld != current trans id %ld",
3561                               th->t_trans_id, journal->j_trans_id);
3562        }
3563
3564        journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
3565        if (atomic_read(&(journal->j_wcount)) > 0) {    /* <= 0 is allowed.  unmounting might not call begin */
3566                atomic_dec(&(journal->j_wcount));
3567        }
3568
3569        /* BUG: deal with the case where j_len is 0, but blocks freed earlier still need to be released.
3570         ** That will be dealt with by the next transaction that actually writes something, but it should be
3571         ** taken care of in this trans
3572         */
3573        BUG_ON(journal->j_len == 0);
3574
3575        /* if wcount > 0, and we are called with flush or commit_now,
3576         ** we wait on j_join_wait.  We will wake up when the last writer has
3577         ** finished the transaction, and started it on its way to the disk.
3578         ** Then, we flush the commit or journal list, and just return 0
3579         ** because the rest of journal end was already done for this transaction.
3580         */
3581        if (atomic_read(&(journal->j_wcount)) > 0) {
3582                if (flush || commit_now) {
3583                        unsigned trans_id;
3584
3585                        jl = journal->j_current_jl;
3586                        trans_id = jl->j_trans_id;
3587                        if (wait_on_commit)
3588                                jl->j_state |= LIST_COMMIT_PENDING;
3589                        atomic_set(&(journal->j_jlock), 1);
3590                        if (flush) {
3591                                journal->j_next_full_flush = 1;
3592                        }
3593                        unlock_journal(sb);
3594
3595                        /* sleep while the current transaction is still j_jlocked */
3596                        while (journal->j_trans_id == trans_id) {
3597                                if (atomic_read(&journal->j_jlock)) {
3598                                        queue_log_writer(sb);
3599                                } else {
3600                                        lock_journal(sb);
3601                                        if (journal->j_trans_id == trans_id) {
3602                                                atomic_set(&(journal->j_jlock),
3603                                                           1);
3604                                        }
3605                                        unlock_journal(sb);
3606                                }
3607                        }
3608                        BUG_ON(journal->j_trans_id == trans_id);
3609
3610                        if (commit_now
3611                            && journal_list_still_alive(sb, trans_id)
3612                            && wait_on_commit) {
3613                                flush_commit_list(sb, jl, 1);
3614                        }
3615                        return 0;
3616                }
3617                unlock_journal(sb);
3618                return 0;
3619        }
3620
3621        /* deal with old transactions where we are the last writers */
3622        now = get_seconds();
3623        if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3624                commit_now = 1;
3625                journal->j_next_async_flush = 1;
3626        }
3627        /* don't batch when someone is waiting on j_join_wait */
3628        /* don't batch when syncing the commit or flushing the whole trans */
3629        if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
3630            && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
3631            && journal->j_len_alloc < journal->j_max_batch
3632            && journal->j_cnode_free > (journal->j_trans_max * 3)) {
3633                journal->j_bcount++;
3634                unlock_journal(sb);
3635                return 0;
3636        }
3637
3638        if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
3639                reiserfs_panic(sb, "journal-003",
3640                               "j_start (%ld) is too high",
3641                               journal->j_start);
3642        }
3643        return 1;
3644}
3645
3646/*
3647** Does all the work that makes deleting blocks safe.
3648** when deleting a block marked BH_JNew, just remove it from the current transaction, clean its buffer_head and move on.
3649**
3650** otherwise:
3651** set a bit for the block in the journal bitmap.  That will prevent it from being allocated for unformatted nodes
3652** before this transaction has finished.
3653**
3654** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.  That will prevent any old transactions with
3655** this block from trying to flush to the real location.  Since we aren't removing the cnode from the journal_list_hash,
3656** the block can't be reallocated yet.
3657**
3658** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
3659*/
3660int journal_mark_freed(struct reiserfs_transaction_handle *th,
3661                       struct super_block *sb, b_blocknr_t blocknr)
3662{
3663        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3664        struct reiserfs_journal_cnode *cn = NULL;
3665        struct buffer_head *bh = NULL;
3666        struct reiserfs_list_bitmap *jb = NULL;
3667        int cleaned = 0;
3668        BUG_ON(!th->t_trans_id);
3669
3670        cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3671        if (cn && cn->bh) {
3672                bh = cn->bh;
3673                get_bh(bh);
3674        }
3675        /* if it is journal new, we just remove it from this transaction */
3676        if (bh && buffer_journal_new(bh)) {
3677                clear_buffer_journal_new(bh);
3678                clear_prepared_bits(bh);
3679                reiserfs_clean_and_file_buffer(bh);
3680                cleaned = remove_from_transaction(sb, blocknr, cleaned);
3681        } else {
3682                /* set the bit for this block in the journal bitmap for this transaction */
3683                jb = journal->j_current_jl->j_list_bitmap;
3684                if (!jb) {
3685                        reiserfs_panic(sb, "journal-1702",
3686                                       "journal_list_bitmap is NULL");
3687                }
3688                set_bit_in_list_bitmap(sb, blocknr, jb);
3689
3690                /* Note, the entire while loop is not allowed to schedule.  */
3691
3692                if (bh) {
3693                        clear_prepared_bits(bh);
3694                        reiserfs_clean_and_file_buffer(bh);
3695                }
3696                cleaned = remove_from_transaction(sb, blocknr, cleaned);
3697
3698                /* find all older transactions with this block, make sure they don't try to write it out */
3699                cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
3700                                          blocknr);
3701                while (cn) {
3702                        if (sb == cn->sb && blocknr == cn->blocknr) {
3703                                set_bit(BLOCK_FREED, &cn->state);
3704                                if (cn->bh) {
3705                                        if (!cleaned) {
3706                                                /* remove_from_transaction will brelse the buffer if it was 
3707                                                 ** in the current trans
3708                                                 */
3709                                                clear_buffer_journal_dirty(cn->
3710                                                                           bh);
3711                                                clear_buffer_dirty(cn->bh);
3712                                                clear_buffer_journal_test(cn->
3713                                                                          bh);
3714                                                cleaned = 1;
3715                                                put_bh(cn->bh);
3716                                                if (atomic_read
3717                                                    (&(cn->bh->b_count)) < 0) {
3718                                                        reiserfs_warning(sb,
3719                                                                 "journal-2138",
3720                                                                 "cn->bh->b_count < 0");
3721                                                }
3722                                        }
3723                                        if (cn->jlist) {        /* since we are clearing the bh, we MUST dec nonzerolen */
3724                                                atomic_dec(&
3725                                                           (cn->jlist->
3726                                                            j_nonzerolen));
3727                                        }
3728                                        cn->bh = NULL;
3729                                }
3730                        }
3731                        cn = cn->hnext;
3732                }
3733        }
3734
3735        if (bh)
3736                release_buffer_page(bh); /* get_hash grabs the buffer */
3737        return 0;
3738}
3739
3740void reiserfs_update_inode_transaction(struct inode *inode)
3741{
3742        struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
3743        REISERFS_I(inode)->i_jl = journal->j_current_jl;
3744        REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
3745}
3746
3747/*
3748 * returns -1 on error, 0 if no commits/barriers were done and 1
3749 * if a transaction was actually committed and the barrier was done
3750 */
3751static int __commit_trans_jl(struct inode *inode, unsigned long id,
3752                             struct reiserfs_journal_list *jl)
3753{
3754        struct reiserfs_transaction_handle th;
3755        struct super_block *sb = inode->i_sb;
3756        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3757        int ret = 0;
3758
3759        /* is it from the current transaction, or from an unknown transaction? */
3760        if (id == journal->j_trans_id) {
3761                jl = journal->j_current_jl;
3762                /* try to let other writers come in and grow this transaction */
3763                let_transaction_grow(sb, id);
3764                if (journal->j_trans_id != id) {
3765                        goto flush_commit_only;
3766                }
3767
3768                ret = journal_begin(&th, sb, 1);
3769                if (ret)
3770                        return ret;
3771
3772                /* someone might have ended this transaction while we joined */
3773                if (journal->j_trans_id != id) {
3774                        reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3775                                                     1);
3776                        journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
3777                        ret = journal_end(&th, sb, 1);
3778                        goto flush_commit_only;
3779                }
3780
3781                ret = journal_end_sync(&th, sb, 1);
3782                if (!ret)
3783                        ret = 1;
3784
3785        } else {
3786                /* this gets tricky, we have to make sure the journal list in
3787                 * the inode still exists.  We know the list is still around
3788                 * if we've got a larger transaction id than the oldest list
3789                 */
3790              flush_commit_only:
3791                if (journal_list_still_alive(inode->i_sb, id)) {
3792                        /*
3793                         * we only set ret to 1 when we know for sure
3794                         * the barrier hasn't been started yet on the commit
3795                         * block.
3796                         */
3797                        if (atomic_read(&jl->j_commit_left) > 1)
3798                                ret = 1;
3799                        flush_commit_list(sb, jl, 1);
3800                        if (journal->j_errno)
3801                                ret = journal->j_errno;
3802                }
3803        }
3804        /* otherwise the list is gone, and long since committed */
3805        return ret;
3806}
3807
3808int reiserfs_commit_for_inode(struct inode *inode)
3809{
3810        unsigned int id = REISERFS_I(inode)->i_trans_id;
3811        struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
3812
3813        /* for the whole inode, assume an unset id means it was
3814         * changed in the current transaction.  This is more conservative
3815         */
3816        if (!id || !jl) {
3817                reiserfs_update_inode_transaction(inode);
3818                id = REISERFS_I(inode)->i_trans_id;
3819                /* jl will be updated in __commit_trans_jl */
3820        }
3821
3822        return __commit_trans_jl(inode, id, jl);
3823}
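
/*
 * Usage sketch (not compiled): an fsync-style path forcing the commit
 * that covers an inode's last change out to disk.
 */
#if 0
        int ret = reiserfs_commit_for_inode(inode);

        /* ret < 0: error; 0: nothing needed doing; 1: commit + barrier done */
        if (ret < 0)
                return ret;
#endif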
3824
3825void reiserfs_restore_prepared_buffer(struct super_block *sb,
3826                                      struct buffer_head *bh)
3827{
3828        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3829        PROC_INFO_INC(sb, journal.restore_prepared);
3830        if (!bh) {
3831                return;
3832        }
3833        if (test_clear_buffer_journal_restore_dirty(bh) &&
3834            buffer_journal_dirty(bh)) {
3835                struct reiserfs_journal_cnode *cn;
3836                cn = get_journal_hash_dev(sb,
3837                                          journal->j_list_hash_table,
3838                                          bh->b_blocknr);
3839                if (cn && can_dirty(cn)) {
3840                        set_buffer_journal_test(bh);
3841                        mark_buffer_dirty(bh);
3842                }
3843        }
3844        clear_buffer_journal_prepared(bh);
3845}
3846
3847extern struct tree_balance *cur_tb;
3848/*
3849** before we can change a metadata block, we have to make sure it won't
3850** be written to disk while we are altering it.  So, we must:
3851** clean it
3852** wait on it.
3853**
3854*/
3855int reiserfs_prepare_for_journal(struct super_block *sb,
3856                                 struct buffer_head *bh, int wait)
3857{
3858        PROC_INFO_INC(sb, journal.prepare);
3859
3860        if (!trylock_buffer(bh)) {
3861                if (!wait)
3862                        return 0;
3863                lock_buffer(bh);
3864        }
3865        set_buffer_journal_prepared(bh);
3866        if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
3867                clear_buffer_journal_test(bh);
3868                set_buffer_journal_restore_dirty(bh);
3869        }
3870        unlock_buffer(bh);
3871        return 1;
3872}
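
/*
 * Pairing sketch (not compiled; change_is_needed() is a hypothetical
 * placeholder): if, after preparing a buffer, the caller decides not to
 * log it after all, the prepared state must be undone so a dirty buffer
 * can make it back to disk.
 */
#if 0
        if (reiserfs_prepare_for_journal(sb, bh, 1)) {
                if (change_is_needed(bh)) {
                        /* ... modify bh and journal_mark_dirty() it ... */
                } else {
                        reiserfs_restore_prepared_buffer(sb, bh);
                }
        }
#endif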
3873
3874static void flush_old_journal_lists(struct super_block *s)
3875{
3876        struct reiserfs_journal *journal = SB_JOURNAL(s);
3877        struct reiserfs_journal_list *jl;
3878        struct list_head *entry;
3879        time_t now = get_seconds();
3880
3881        while (!list_empty(&journal->j_journal_list)) {
3882                entry = journal->j_journal_list.next;
3883                jl = JOURNAL_LIST_ENTRY(entry);
3884                /* this check should always be run, to send old lists to disk */
3885                if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
3886                    atomic_read(&jl->j_commit_left) == 0 &&
3887                    test_transaction(s, jl)) {
3888                        flush_used_journal_lists(s, jl);
3889                } else {
3890                        break;
3891                }
3892        }
3893}
3894
3895/*
3896** long and ugly.  If flush, will not return until all commit
3897** blocks and all real buffers in the trans are on disk.
3898** If no_async, won't return until all commit blocks are on disk.
3899**
3900** keep reading, there are comments as you go along
3901**
3902** If the journal is aborted, we just clean up. Things like flushing
3903** journal lists, etc just won't happen.
3904*/
3905static int do_journal_end(struct reiserfs_transaction_handle *th,
3906                          struct super_block *sb, unsigned long nblocks,
3907                          int flags)
3908{
3909        struct reiserfs_journal *journal = SB_JOURNAL(sb);
3910        struct reiserfs_journal_cnode *cn, *next, *jl_cn;
3911        struct reiserfs_journal_cnode *last_cn = NULL;
3912        struct reiserfs_journal_desc *desc;
3913        struct reiserfs_journal_commit *commit;
3914        struct buffer_head *c_bh;       /* commit bh */
3915        struct buffer_head *d_bh;       /* desc bh */
3916        int cur_write_start = 0;        /* start index of current log write */
3917        int old_start;
3918        int i;
3919        int flush;
3920        int wait_on_commit;
3921        struct reiserfs_journal_list *jl, *temp_jl;
3922        struct list_head *entry, *safe;
3923        unsigned long jindex;
3924        unsigned int commit_trans_id;
3925        int trans_half;
3926
3927        BUG_ON(th->t_refcount > 1);
3928        BUG_ON(!th->t_trans_id);
3929
3930        /* protect flush_older_commits from making mistakes if the
3931           transaction ID counter overflows.  */
3932        if (th->t_trans_id == ~0U)
3933                flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
3934        flush = flags & FLUSH_ALL;
3935        wait_on_commit = flags & WAIT;
3936
3937        current->journal_info = th->t_handle_save;
3938        reiserfs_check_lock_depth(sb, "journal end");
3939        if (journal->j_len == 0) {
3940                reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3941                                             1);
3942                journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3943        }
3944
3945        lock_journal(sb);
3946        if (journal->j_next_full_flush) {
3947                flags |= FLUSH_ALL;
3948                flush = 1;
3949        }
3950        if (journal->j_next_async_flush) {
3951                flags |= COMMIT_NOW | WAIT;
3952                wait_on_commit = 1;
3953        }
3954
3955        /* check_journal_end locks the journal and unlocks it if it does not return 1.
3956         ** It tells us whether we should continue with the journal_end or just return.
3957         */
3958        if (!check_journal_end(th, sb, nblocks, flags)) {
3959                sb->s_dirt = 1;
3960                wake_queued_writers(sb);
3961                reiserfs_async_progress_wait(sb);
3962                goto out;
3963        }
3964
3965        /* check_journal_end might set these, check again */
3966        if (journal->j_next_full_flush) {
3967                flush = 1;
3968        }
3969
3970        /*
3971         ** j_must_wait means we have to flush the log blocks and the real
3972         ** blocks for this transaction
3973         */
3974        if (journal->j_must_wait > 0) {
3975                flush = 1;
3976        }
3977#ifdef REISERFS_PREALLOCATE
3978        /* quota ops might need to nest.  Set up the journal_info pointer for
3979         * them and raise the refcount so that it is > 0. */
3980        current->journal_info = th;
3981        th->t_refcount++;
3982        reiserfs_discard_all_prealloc(th);      /* it should not bring new blocks
3983                                                 * into the transaction */
3984        th->t_refcount--;
3985        current->journal_info = th->t_handle_save;
3986#endif
3987
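            /* on disk this transaction takes j_len + 2 journal blocks: the desc
             * block at j_start, log copies of the real blocks at j_start + 1
             * through j_start + j_len, and the commit block at
             * j_start + j_len + 1, all modulo the on-disk journal size */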
3988        /* setup description block */
3989        d_bh =
3990            journal_getblk(sb,
3991                           SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
3992                           journal->j_start);
3993        set_buffer_uptodate(d_bh);
3994        desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
3995        memset(d_bh->b_data, 0, d_bh->b_size);
3996        memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
3997        set_desc_trans_id(desc, journal->j_trans_id);
3998
3999        /* set up the commit block.  Don't write this one (and keep it clean) until everything else is on disk */
4000        c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4001                              ((journal->j_start + journal->j_len +
4002                                1) % SB_ONDISK_JOURNAL_SIZE(sb)));
4003        commit = (struct reiserfs_journal_commit *)c_bh->b_data;
4004        memset(c_bh->b_data, 0, c_bh->b_size);
4005        set_commit_trans_id(commit, journal->j_trans_id);
4006        set_buffer_uptodate(c_bh);
4007
4008        /* init this journal list */
4009        jl = journal->j_current_jl;
4010
4011        /* we lock the commit before doing anything because
4012         * we want to make sure nobody tries to run flush_commit_list until
4013         * the new transaction is fully setup, and we've already flushed the
4014         * ordered bh list
4015         */
4016        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4017
4018        /* save the transaction id in case we need to commit it later */
4019        commit_trans_id = jl->j_trans_id;
4020
4021        atomic_set(&jl->j_older_commits_done, 0);
4022        jl->j_trans_id = journal->j_trans_id;
4023        jl->j_timestamp = journal->j_trans_start_time;
4024        jl->j_commit_bh = c_bh;
4025        jl->j_start = journal->j_start;
4026        jl->j_len = journal->j_len;
4027        atomic_set(&jl->j_nonzerolen, journal->j_len);
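            /* j_commit_left counts every log block plus the desc and commit
             * blocks, hence the + 2 */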
4028        atomic_set(&jl->j_commit_left, journal->j_len + 2);
4029        jl->j_realblock = NULL;
4030
4031        /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
4032         ** For each real block, add it to the journal list hash, and copy its
4033         ** block number into the real block index array in the desc or commit block
4034         */
4035        trans_half = journal_trans_half(sb->s_blocksize);
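            /* trans_half is how many block numbers fit in the desc block; any
             * overflow goes into the commit block's array instead */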
4036        for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
4037                if (buffer_journaled(cn->bh)) {
4038                        jl_cn = get_cnode(sb);
4039                        if (!jl_cn) {
4040                                reiserfs_panic(sb, "journal-1676",
4041                                               "get_cnode returned NULL");
4042                        }
4043                        if (i == 0) {
4044                                jl->j_realblock = jl_cn;
4045                        }
4046                        jl_cn->prev = last_cn;
4047                        jl_cn->next = NULL;
4048                        if (last_cn) {
4049                                last_cn->next = jl_cn;
4050                        }
4051                        last_cn = jl_cn;
4052                        /* make sure the block we are trying to log is not a
4053                           block of the journal or reserved area */
4054
4055                        if (is_block_in_log_or_reserved_area
4056                            (sb, cn->bh->b_blocknr)) {
4057                                reiserfs_panic(sb, "journal-2332",
4058                                               "Trying to log block %lu, "
4059                                               "which is a log block",
4060                                               cn->bh->b_blocknr);
4061                        }
4062                        jl_cn->blocknr = cn->bh->b_blocknr;
4063                        jl_cn->state = 0;
4064                        jl_cn->sb = sb;
4065                        jl_cn->bh = cn->bh;
4066                        jl_cn->jlist = jl;
4067                        insert_journal_hash(journal->j_list_hash_table, jl_cn);
4068                        if (i < trans_half) {
4069                                desc->j_realblock[i] =
4070                                    cpu_to_le32(cn->bh->b_blocknr);
4071                        } else {
4072                                commit->j_realblock[i - trans_half] =
4073                                    cpu_to_le32(cn->bh->b_blocknr);
4074                        }
4075                } else {
4076                        i--;
4077                }
4078        }
4079        set_desc_trans_len(desc, journal->j_len);
4080        set_desc_mount_id(desc, journal->j_mount_id);
4081        set_desc_trans_id(desc, journal->j_trans_id);
4082        set_commit_trans_len(commit, journal->j_len);
4083
4084        /* special check in case all buffers in the journal were marked for not logging */
4085        BUG_ON(journal->j_len == 0);
4086
4087        /* we're about to dirty all the log blocks, mark the description block
4088         * dirty now too.  Don't mark the commit block dirty until all the
4089         * others are on disk
4090         */
4091        mark_buffer_dirty(d_bh);
4092
4093        /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
4094        cur_write_start = journal->j_start;
4095        cn = journal->j_first;
4096        jindex = 1;             /* start at one so we don't get the desc again */
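            /* copy each journaled buffer into the next log block, then downgrade
             * the real buffer from journaled to journal_dirty so it is written
             * back to its home location only after the commit block is on disk */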
4097        while (cn) {
4098                clear_buffer_journal_new(cn->bh);
4099                /* copy all the real blocks into the log area.  Dirty the log blocks */
4100                if (buffer_journaled(cn->bh)) {
4101                        struct buffer_head *tmp_bh;
4102                        char *addr;
4103                        struct page *page;
4104                        tmp_bh =
4105                            journal_getblk(sb,
4106                                           SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4107                                           ((cur_write_start +
4108                                             jindex) %
4109                                            SB_ONDISK_JOURNAL_SIZE(sb)));
4110                        set_buffer_uptodate(tmp_bh);
4111                        page = cn->bh->b_page;
4112                        addr = kmap(page);
4113                        memcpy(tmp_bh->b_data,
4114                               addr + offset_in_page(cn->bh->b_data),
4115                               cn->bh->b_size);
4116                        kunmap(page);
4117                        mark_buffer_dirty(tmp_bh);
4118                        jindex++;
4119                        set_buffer_journal_dirty(cn->bh);
4120                        clear_buffer_journaled(cn->bh);
4121                } else {
4122                        /* JDirty was cleared sometime during the transaction.  Don't log this one */
4123                        reiserfs_warning(sb, "journal-2048",
4124                                         "BAD, buffer in journal hash, "
4125                                         "but not JDirty!");
4126                        brelse(cn->bh);
4127                }
4128                next = cn->next;
4129                free_cnode(sb, cn);
4130                cn = next;
4131                reiserfs_write_unlock(sb);
4132                cond_resched();
4133                reiserfs_write_lock(sb);
4134        }
4135
4136        /* we are done with both the c_bh and d_bh, but
4137         ** c_bh must be written after all other commit blocks,
4138         ** so we dirty/brelse c_bh in flush_commit_list, once commit_left <= 1.
4139         */
4140
4141        journal->j_current_jl = alloc_journal_list(sb);
4142
4143        /* now it is safe to insert this transaction on the main list */
4144        list_add_tail(&jl->j_list, &journal->j_journal_list);
4145        list_add_tail(&jl->j_working_list, &journal->j_working_list);
4146        journal->j_num_work_lists++;
4147
4148        /* reset journal values for the next transaction */
4149        old_start = journal->j_start;
4150        journal->j_start =
4151            (journal->j_start + journal->j_len +
4152             2) % SB_ONDISK_JOURNAL_SIZE(sb);
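            /* the + 2 steps over the desc and commit blocks of the transaction
             * we just logged */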
4153        atomic_set(&(journal->j_wcount), 0);
4154        journal->j_bcount = 0;
4155        journal->j_last = NULL;
4156        journal->j_first = NULL;
4157        journal->j_len = 0;
4158        journal->j_trans_start_time = 0;
4159        /* check for trans_id overflow */
4160        if (++journal->j_trans_id == 0)
4161                journal->j_trans_id = 10;
4162        journal->j_current_jl->j_trans_id = journal->j_trans_id;
4163        journal->j_must_wait = 0;
4164        journal->j_len_alloc = 0;
4165        journal->j_next_full_flush = 0;
4166        journal->j_next_async_flush = 0;
4167        init_journal_hash(sb);
4168
4169        /* make sure reiserfs_add_jh sees the new current_jl before we
4170         * write out the tails */
4171        smp_mb();
4172
4173        /* tail conversion targets have to hit the disk before we end the
4174         * transaction.  Otherwise a later transaction might repack the tail
4175         * before this transaction commits, leaving the data block unflushed and
4176         * clean; if we crash before the later transaction commits, the data
4177         * block is lost.
4178         */
4179        if (!list_empty(&jl->j_tail_bh_list)) {
4180                reiserfs_write_unlock(sb);
4181                write_ordered_buffers(&journal->j_dirty_buffers_lock,
4182                                      journal, jl, &jl->j_tail_bh_list);
4183                reiserfs_write_lock(sb);
4184        }
4185        BUG_ON(!list_empty(&jl->j_tail_bh_list));
4186        mutex_unlock(&jl->j_commit_mutex);
4187
4188        /* honor the flush wishes from the caller.  Simple commits can
4189         ** be done outside the journal lock; they are done below.
4190         **
4191         ** if we don't flush the commit list right now, we put it into
4192         ** the work queue so that people waiting on the async progress work
4193         ** queue don't wait for this proc to flush journal lists and such.
4194         */
4195        if (flush) {
4196                flush_commit_list(sb, jl, 1);
4197                flush_journal_list(sb, jl, 1);
4198        } else if (!(jl->j_state & LIST_COMMIT_PENDING))
4199                queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
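            /* HZ / 10 is roughly a 100ms delay; backgrounding the commit lets
             * this writer return quickly and lets later transactions batch up
             * behind it */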
4200
4201        /* if the next transaction has any chance of wrapping, flush
4202         ** transactions that might get overwritten.  If any journal lists are
4203         ** very old, flush them as well.
4204         */
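            /* the window the next transaction may use is j_start through
             * j_start + j_trans_max + 1, modulo the on-disk journal size; the
             * checks below handle the non-wrapping and wrapping cases */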
4205      first_jl:
4206        list_for_each_safe(entry, safe, &journal->j_journal_list) {
4207                temp_jl = JOURNAL_LIST_ENTRY(entry);
4208                if (journal->j_start <= temp_jl->j_start) {
4209                        if ((journal->j_start + journal->j_trans_max + 1) >=
4210                            temp_jl->j_start) {
4211                                flush_used_journal_lists(sb, temp_jl);
4212                                goto first_jl;
4213                        } else if ((journal->j_start +
4214                                    journal->j_trans_max + 1) <
4215                                   SB_ONDISK_JOURNAL_SIZE(sb)) {
4216                                /* if we don't cross into the next transaction and we
4217                                 * don't wrap, there is no way we can overlap any later
4218                                 * transactions; break now
4219                                 */
4220                                break;
4221                        }
4222                } else if ((journal->j_start +
4223                            journal->j_trans_max + 1) >
4224                           SB_ONDISK_JOURNAL_SIZE(sb)) {
4225                        if (((journal->j_start + journal->j_trans_max + 1) %
4226                             SB_ONDISK_JOURNAL_SIZE(sb)) >=
4227                            temp_jl->j_start) {
4228                                flush_used_journal_lists(sb, temp_jl);
4229                                goto first_jl;
4230                        } else {
4231                                /* we don't overlap anything from our start to the end of the
4232                                 * log, and our wrapped portion doesn't overlap anything at
4233                                 * the start of the log.  We can break
4234                                 */
4235                                break;
4236                        }
4237                }
4238        }
4239        flush_old_journal_lists(sb);
4240
4241        journal->j_current_jl->j_list_bitmap =
4242            get_list_bitmap(sb, journal->j_current_jl);
4243
4244        if (!(journal->j_current_jl->j_list_bitmap)) {
4245                reiserfs_panic(sb, "journal-1996",
4246                               "could not get a list bitmap");
4247        }
4248
4249        atomic_set(&(journal->j_jlock), 0);
4250        unlock_journal(sb);
4251        /* wake up anybody waiting to join. */
4252        clear_bit(J_WRITERS_QUEUED, &journal->j_state);
4253        wake_up(&(journal->j_join_wait));
4254
4255        if (!flush && wait_on_commit &&
4256            journal_list_still_alive(sb, commit_trans_id)) {
4257                flush_commit_list(sb, jl, 1);
4258        }
4259      out:
4260        reiserfs_check_lock_depth(sb, "journal end2");
4261
4262        memset(th, 0, sizeof(*th));
4263        /* Re-set th->t_super, so we can properly keep track of how many
4264         * persistent transactions there are.  We need to do this so that if
4265         * this call is part of a failed restart_transaction, we can free it later */
4266        th->t_super = sb;
4267
4268        return journal->j_errno;
4269}
4270
4271/* Set the file system read-only and refuse new transactions */
4272void reiserfs_abort_journal(struct super_block *sb, int errno)
4273{
4274        struct reiserfs_journal *journal = SB_JOURNAL(sb);
4275        if (test_bit(J_ABORTED, &journal->j_state))
4276                return;
4277
4278        if (!journal->j_errno)
4279                journal->j_errno = errno;
4280
4281        sb->s_flags |= MS_RDONLY;
4282        set_bit(J_ABORTED, &journal->j_state);
4283
4284#ifdef CONFIG_REISERFS_CHECK
4285        dump_stack();
4286#endif
4287}
4288