linux/fs/buffer.c
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
   16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/module.h>
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <linux/bitops.h>
  42#include <linux/mpage.h>
  43#include <linux/bit_spinlock.h>
  44#include <linux/cleancache.h>
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  47
  48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50inline void
  51init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  52{
  53        bh->b_end_io = handler;
  54        bh->b_private = private;
  55}
  56EXPORT_SYMBOL(init_buffer);
  57
  58static int sleep_on_buffer(void *word)
  59{
  60        io_schedule();
  61        return 0;
  62}
  63
  64void __lock_buffer(struct buffer_head *bh)
  65{
  66        wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
  67                                                        TASK_UNINTERRUPTIBLE);
  68}
  69EXPORT_SYMBOL(__lock_buffer);
  70
  71void unlock_buffer(struct buffer_head *bh)
  72{
  73        clear_bit_unlock(BH_Lock, &bh->b_state);
  74        smp_mb__after_clear_bit();
  75        wake_up_bit(&bh->b_state, BH_Lock);
  76}
  77EXPORT_SYMBOL(unlock_buffer);
  78
  79/*
  80 * Block until a buffer comes unlocked.  This doesn't stop it
  81 * from becoming locked again - you have to lock it yourself
  82 * if you want to preserve its state.
  83 */
  84void __wait_on_buffer(struct buffer_head * bh)
  85{
  86        wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
  87}
  88EXPORT_SYMBOL(__wait_on_buffer);
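
/*
 * Illustrative sketch (not part of the original file): the usual way a
 * caller consumes the result of a previously submitted read.  The function
 * name is hypothetical.  Note that after wait_on_buffer() returns, nothing
 * stops the buffer from being locked again by somebody else; take the lock
 * yourself with lock_buffer() if you need the state to stay put while you
 * examine it.
 *
 *	static int example_wait_for_read(struct buffer_head *bh)
 *	{
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			return -EIO;
 *		return 0;
 *	}
 */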
  89
  90static void
  91__clear_page_buffers(struct page *page)
  92{
  93        ClearPagePrivate(page);
  94        set_page_private(page, 0);
  95        page_cache_release(page);
  96}
  97
  98
  99static int quiet_error(struct buffer_head *bh)
 100{
 101        if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
 102                return 0;
 103        return 1;
 104}
 105
 106
 107static void buffer_io_error(struct buffer_head *bh)
 108{
 109        char b[BDEVNAME_SIZE];
 110        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 111                        bdevname(bh->b_bdev, b),
 112                        (unsigned long long)bh->b_blocknr);
 113}
 114
 115/*
 116 * End-of-IO handler helper function which does not touch the bh after
 117 * unlocking it.
 118 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
  119 * a race there is benign: unlock_buffer() only uses the bh's address for
 120 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 121 * itself.
 122 */
 123static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 124{
 125        if (uptodate) {
 126                set_buffer_uptodate(bh);
 127        } else {
 128                /* This happens, due to failed READA attempts. */
 129                clear_buffer_uptodate(bh);
 130        }
 131        unlock_buffer(bh);
 132}
 133
 134/*
 135 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 136 * unlock the buffer. This is what ll_rw_block uses too.
 137 */
 138void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 139{
 140        __end_buffer_read_notouch(bh, uptodate);
 141        put_bh(bh);
 142}
 143EXPORT_SYMBOL(end_buffer_read_sync);
 144
 145void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 146{
 147        char b[BDEVNAME_SIZE];
 148
 149        if (uptodate) {
 150                set_buffer_uptodate(bh);
 151        } else {
 152                if (!quiet_error(bh)) {
 153                        buffer_io_error(bh);
 154                        printk(KERN_WARNING "lost page write due to "
 155                                        "I/O error on %s\n",
 156                                       bdevname(bh->b_bdev, b));
 157                }
 158                set_buffer_write_io_error(bh);
 159                clear_buffer_uptodate(bh);
 160        }
 161        unlock_buffer(bh);
 162        put_bh(bh);
 163}
 164EXPORT_SYMBOL(end_buffer_write_sync);
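
/*
 * Illustrative sketch (not part of the original file): writing a single
 * dirty buffer synchronously with end_buffer_write_sync() as the completion
 * handler - essentially the pattern behind sync_dirty_buffer().  The
 * function name is hypothetical.
 *
 *	static int example_write_buffer_sync(struct buffer_head *bh)
 *	{
 *		lock_buffer(bh);
 *		if (test_clear_buffer_dirty(bh)) {
 *			get_bh(bh);
 *			bh->b_end_io = end_buffer_write_sync;
 *			submit_bh(WRITE, bh);
 *			wait_on_buffer(bh);
 *			if (!buffer_uptodate(bh))
 *				return -EIO;
 *		} else {
 *			unlock_buffer(bh);
 *		}
 *		return 0;
 *	}
 */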
 165
 166/*
 167 * Various filesystems appear to want __find_get_block to be non-blocking.
 168 * But it's the page lock which protects the buffers.  To get around this,
 169 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 170 * private_lock.
 171 *
 172 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 173 * may be quite high.  This code could TryLock the page, and if that
 174 * succeeds, there is no need to take private_lock. (But if
 175 * private_lock is contended then so is mapping->tree_lock).
 176 */
 177static struct buffer_head *
 178__find_get_block_slow(struct block_device *bdev, sector_t block)
 179{
 180        struct inode *bd_inode = bdev->bd_inode;
 181        struct address_space *bd_mapping = bd_inode->i_mapping;
 182        struct buffer_head *ret = NULL;
 183        pgoff_t index;
 184        struct buffer_head *bh;
 185        struct buffer_head *head;
 186        struct page *page;
 187        int all_mapped = 1;
 188
 189        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 190        page = find_get_page(bd_mapping, index);
 191        if (!page)
 192                goto out;
 193
 194        spin_lock(&bd_mapping->private_lock);
 195        if (!page_has_buffers(page))
 196                goto out_unlock;
 197        head = page_buffers(page);
 198        bh = head;
 199        do {
 200                if (!buffer_mapped(bh))
 201                        all_mapped = 0;
 202                else if (bh->b_blocknr == block) {
 203                        ret = bh;
 204                        get_bh(bh);
 205                        goto out_unlock;
 206                }
 207                bh = bh->b_this_page;
 208        } while (bh != head);
 209
 210        /* we might be here because some of the buffers on this page are
 211         * not mapped.  This is due to various races between
 212         * file io on the block device and getblk.  It gets dealt with
 213         * elsewhere, don't buffer_error if we had some unmapped buffers
 214         */
 215        if (all_mapped) {
 216                char b[BDEVNAME_SIZE];
 217
 218                printk("__find_get_block_slow() failed. "
 219                        "block=%llu, b_blocknr=%llu\n",
 220                        (unsigned long long)block,
 221                        (unsigned long long)bh->b_blocknr);
 222                printk("b_state=0x%08lx, b_size=%zu\n",
 223                        bh->b_state, bh->b_size);
 224                printk("device %s blocksize: %d\n", bdevname(bdev, b),
 225                        1 << bd_inode->i_blkbits);
 226        }
 227out_unlock:
 228        spin_unlock(&bd_mapping->private_lock);
 229        page_cache_release(page);
 230out:
 231        return ret;
 232}
 233
 234/* If invalidate_buffers() will trash dirty buffers, it means some kind
  235   of fs corruption is going on. Trashing dirty data always implies losing
 236   information that was supposed to be just stored on the physical layer
 237   by the user.
 238
  239   Thus invalidate_buffers in general usage is not allowed to trash
  240   dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
 241   be preserved.  These buffers are simply skipped.
 242  
 243   We also skip buffers which are still in use.  For example this can
 244   happen if a userspace program is reading the block device.
 245
  246   NOTE: In the case where the user removes a removable-media disk even if
  247   there's still dirty data not synced on disk (due to a bug in the device driver
  248   or to an error by the user), by not destroying the dirty buffers we could
  249   generate corruption also on the next media inserted; thus a parameter is
  250   necessary to handle this case in the safest way possible (trying
  251   not to also corrupt the newly inserted disk with the data belonging to
  252   the old, now corrupted disk). Also, for the ramdisk, the natural thing
  253   to do in order to release the ramdisk memory is to destroy dirty buffers.
 254
  255   These are two special cases. Normal usage implies that the device driver
  256   issues a sync on the device (without waiting for I/O completion) and
 257   then an invalidate_buffers call that doesn't trash dirty buffers.
 258
 259   For handling cache coherency with the blkdev pagecache the 'update' case
  260   has been introduced. It is needed to re-read from disk any pinned
 261   buffer. NOTE: re-reading from disk is destructive so we can do it only
 262   when we assume nobody is changing the buffercache under our I/O and when
 263   we think the disk contains more recent information than the buffercache.
 264   The update == 1 pass marks the buffers we need to update, the update == 2
 265   pass does the actual I/O. */
 266void invalidate_bdev(struct block_device *bdev)
 267{
 268        struct address_space *mapping = bdev->bd_inode->i_mapping;
 269
 270        if (mapping->nrpages == 0)
 271                return;
 272
 273        invalidate_bh_lrus();
 274        lru_add_drain_all();    /* make sure all lru add caches are flushed */
 275        invalidate_mapping_pages(mapping, 0, -1);
 276        /* 99% of the time, we don't need to flush the cleancache on the bdev.
  277         * But, for the strange corners, let's be cautious.
 278         */
 279        cleancache_flush_inode(mapping);
 280}
 281EXPORT_SYMBOL(invalidate_bdev);
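
/*
 * Illustrative sketch (not part of the original file): the "sync then
 * invalidate" sequence described above, as a driver or filesystem might use
 * it before re-reading a device.  The function name is hypothetical.
 *
 *	static void example_flush_and_invalidate(struct block_device *bdev)
 *	{
 *		sync_blockdev(bdev);
 *		invalidate_bdev(bdev);
 *	}
 */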
 282
 283/*
 284 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 285 */
 286static void free_more_memory(void)
 287{
 288        struct zone *zone;
 289        int nid;
 290
 291        wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 292        yield();
 293
 294        for_each_online_node(nid) {
 295                (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 296                                                gfp_zone(GFP_NOFS), NULL,
 297                                                &zone);
 298                if (zone)
 299                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 300                                                GFP_NOFS, NULL);
 301        }
 302}
 303
 304/*
 305 * I/O completion handler for block_read_full_page() - pages
 306 * which come unlocked at the end of I/O.
 307 */
 308static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 309{
 310        unsigned long flags;
 311        struct buffer_head *first;
 312        struct buffer_head *tmp;
 313        struct page *page;
 314        int page_uptodate = 1;
 315
 316        BUG_ON(!buffer_async_read(bh));
 317
 318        page = bh->b_page;
 319        if (uptodate) {
 320                set_buffer_uptodate(bh);
 321        } else {
 322                clear_buffer_uptodate(bh);
 323                if (!quiet_error(bh))
 324                        buffer_io_error(bh);
 325                SetPageError(page);
 326        }
 327
 328        /*
 329         * Be _very_ careful from here on. Bad things can happen if
 330         * two buffer heads end IO at almost the same time and both
 331         * decide that the page is now completely done.
 332         */
 333        first = page_buffers(page);
 334        local_irq_save(flags);
 335        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 336        clear_buffer_async_read(bh);
 337        unlock_buffer(bh);
 338        tmp = bh;
 339        do {
 340                if (!buffer_uptodate(tmp))
 341                        page_uptodate = 0;
 342                if (buffer_async_read(tmp)) {
 343                        BUG_ON(!buffer_locked(tmp));
 344                        goto still_busy;
 345                }
 346                tmp = tmp->b_this_page;
 347        } while (tmp != bh);
 348        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 349        local_irq_restore(flags);
 350
 351        /*
 352         * If none of the buffers had errors and they are all
 353         * uptodate then we can set the page uptodate.
 354         */
 355        if (page_uptodate && !PageError(page))
 356                SetPageUptodate(page);
 357        unlock_page(page);
 358        return;
 359
 360still_busy:
 361        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 362        local_irq_restore(flags);
 363        return;
 364}
 365
 366/*
 367 * Completion handler for block_write_full_page() - pages which are unlocked
 368 * during I/O, and which have PageWriteback cleared upon I/O completion.
 369 */
 370void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 371{
 372        char b[BDEVNAME_SIZE];
 373        unsigned long flags;
 374        struct buffer_head *first;
 375        struct buffer_head *tmp;
 376        struct page *page;
 377
 378        BUG_ON(!buffer_async_write(bh));
 379
 380        page = bh->b_page;
 381        if (uptodate) {
 382                set_buffer_uptodate(bh);
 383        } else {
 384                if (!quiet_error(bh)) {
 385                        buffer_io_error(bh);
 386                        printk(KERN_WARNING "lost page write due to "
 387                                        "I/O error on %s\n",
 388                               bdevname(bh->b_bdev, b));
 389                }
 390                set_bit(AS_EIO, &page->mapping->flags);
 391                set_buffer_write_io_error(bh);
 392                clear_buffer_uptodate(bh);
 393                SetPageError(page);
 394        }
 395
 396        first = page_buffers(page);
 397        local_irq_save(flags);
 398        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 399
 400        clear_buffer_async_write(bh);
 401        unlock_buffer(bh);
 402        tmp = bh->b_this_page;
 403        while (tmp != bh) {
 404                if (buffer_async_write(tmp)) {
 405                        BUG_ON(!buffer_locked(tmp));
 406                        goto still_busy;
 407                }
 408                tmp = tmp->b_this_page;
 409        }
 410        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 411        local_irq_restore(flags);
 412        end_page_writeback(page);
 413        return;
 414
 415still_busy:
 416        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 417        local_irq_restore(flags);
 418        return;
 419}
 420EXPORT_SYMBOL(end_buffer_async_write);
 421
 422/*
  423 * If a page's buffers are under async read-in (end_buffer_async_read
 424 * completion) then there is a possibility that another thread of
 425 * control could lock one of the buffers after it has completed
 426 * but while some of the other buffers have not completed.  This
 427 * locked buffer would confuse end_buffer_async_read() into not unlocking
 428 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 429 * that this buffer is not under async I/O.
 430 *
 431 * The page comes unlocked when it has no locked buffer_async buffers
 432 * left.
 433 *
  434 * PageLocked prevents anyone from starting new async I/O against any of
  435 * the buffers.
 436 *
 437 * PageWriteback is used to prevent simultaneous writeout of the same
 438 * page.
 439 *
 440 * PageLocked prevents anyone from starting writeback of a page which is
 441 * under read I/O (PageWriteback is only ever set against a locked page).
 442 */
 443static void mark_buffer_async_read(struct buffer_head *bh)
 444{
 445        bh->b_end_io = end_buffer_async_read;
 446        set_buffer_async_read(bh);
 447}
 448
 449static void mark_buffer_async_write_endio(struct buffer_head *bh,
 450                                          bh_end_io_t *handler)
 451{
 452        bh->b_end_io = handler;
 453        set_buffer_async_write(bh);
 454}
 455
 456void mark_buffer_async_write(struct buffer_head *bh)
 457{
 458        mark_buffer_async_write_endio(bh, end_buffer_async_write);
 459}
 460EXPORT_SYMBOL(mark_buffer_async_write);
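
/*
 * Illustrative sketch (not part of the original file): a much-simplified
 * writepage-style loop showing how the async-write marking is meant to be
 * used; error handling, the no-dirty-buffers case and writeback accounting
 * are all omitted (see block_write_full_page() for the real thing).  The
 * function name is hypothetical.
 *
 *	static int example_writepage(struct page *page)
 *	{
 *		struct buffer_head *head = page_buffers(page);
 *		struct buffer_head *bh = head;
 *
 *		do {
 *			lock_buffer(bh);
 *			if (buffer_mapped(bh) && test_clear_buffer_dirty(bh))
 *				mark_buffer_async_write(bh);
 *			else
 *				unlock_buffer(bh);
 *			bh = bh->b_this_page;
 *		} while (bh != head);
 *
 *		set_page_writeback(page);
 *		unlock_page(page);
 *
 *		do {
 *			struct buffer_head *next = bh->b_this_page;
 *			if (buffer_async_write(bh))
 *				submit_bh(WRITE, bh);
 *			bh = next;
 *		} while (bh != head);
 *		return 0;
 *	}
 */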
 461
 462
 463/*
 464 * fs/buffer.c contains helper functions for buffer-backed address space's
 465 * fsync functions.  A common requirement for buffer-based filesystems is
 466 * that certain data from the backing blockdev needs to be written out for
 467 * a successful fsync().  For example, ext2 indirect blocks need to be
 468 * written back and waited upon before fsync() returns.
 469 *
 470 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 471 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 472 * management of a list of dependent buffers at ->i_mapping->private_list.
 473 *
 474 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 475 * from their controlling inode's queue when they are being freed.  But
 476 * try_to_free_buffers() will be operating against the *blockdev* mapping
 477 * at the time, not against the S_ISREG file which depends on those buffers.
 478 * So the locking for private_list is via the private_lock in the address_space
 479 * which backs the buffers.  Which is different from the address_space 
 480 * against which the buffers are listed.  So for a particular address_space,
 481 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 482 * mapping->private_list will always be protected by the backing blockdev's
 483 * ->private_lock.
 484 *
 485 * Which introduces a requirement: all buffers on an address_space's
 486 * ->private_list must be from the same address_space: the blockdev's.
 487 *
 488 * address_spaces which do not place buffers at ->private_list via these
 489 * utility functions are free to use private_lock and private_list for
 490 * whatever they want.  The only requirement is that list_empty(private_list)
 491 * be true at clear_inode() time.
 492 *
 493 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 494 * filesystems should do that.  invalidate_inode_buffers() should just go
 495 * BUG_ON(!list_empty).
 496 *
 497 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 498 * take an address_space, not an inode.  And it should be called
 499 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 500 * queued up.
 501 *
 502 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 503 * list if it is already on a list.  Because if the buffer is on a list,
 504 * it *must* already be on the right one.  If not, the filesystem is being
 505 * silly.  This will save a ton of locking.  But first we have to ensure
 506 * that buffers are taken *off* the old inode's list when they are freed
 507 * (presumably in truncate).  That requires careful auditing of all
 508 * filesystems (do it inside bforget()).  It could also be done by bringing
 509 * b_inode back.
 510 */
 511
 512/*
 513 * The buffer's backing address_space's private_lock must be held
 514 */
 515static void __remove_assoc_queue(struct buffer_head *bh)
 516{
 517        list_del_init(&bh->b_assoc_buffers);
 518        WARN_ON(!bh->b_assoc_map);
 519        if (buffer_write_io_error(bh))
 520                set_bit(AS_EIO, &bh->b_assoc_map->flags);
 521        bh->b_assoc_map = NULL;
 522}
 523
 524int inode_has_buffers(struct inode *inode)
 525{
 526        return !list_empty(&inode->i_data.private_list);
 527}
 528
 529/*
 530 * osync is designed to support O_SYNC io.  It waits synchronously for
 531 * all already-submitted IO to complete, but does not queue any new
 532 * writes to the disk.
 533 *
 534 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 535 * you dirty the buffers, and then use osync_inode_buffers to wait for
 536 * completion.  Any other dirty buffers which are not yet queued for
 537 * write will not be flushed to disk by the osync.
 538 */
 539static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 540{
 541        struct buffer_head *bh;
 542        struct list_head *p;
 543        int err = 0;
 544
 545        spin_lock(lock);
 546repeat:
 547        list_for_each_prev(p, list) {
 548                bh = BH_ENTRY(p);
 549                if (buffer_locked(bh)) {
 550                        get_bh(bh);
 551                        spin_unlock(lock);
 552                        wait_on_buffer(bh);
 553                        if (!buffer_uptodate(bh))
 554                                err = -EIO;
 555                        brelse(bh);
 556                        spin_lock(lock);
 557                        goto repeat;
 558                }
 559        }
 560        spin_unlock(lock);
 561        return err;
 562}
 563
 564static void do_thaw_one(struct super_block *sb, void *unused)
 565{
 566        char b[BDEVNAME_SIZE];
 567        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 568                printk(KERN_WARNING "Emergency Thaw on %s\n",
 569                       bdevname(sb->s_bdev, b));
 570}
 571
 572static void do_thaw_all(struct work_struct *work)
 573{
 574        iterate_supers(do_thaw_one, NULL);
 575        kfree(work);
 576        printk(KERN_WARNING "Emergency Thaw complete\n");
 577}
 578
 579/**
 580 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 581 *
 582 * Used for emergency unfreeze of all filesystems via SysRq
 583 */
 584void emergency_thaw_all(void)
 585{
 586        struct work_struct *work;
 587
 588        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 589        if (work) {
 590                INIT_WORK(work, do_thaw_all);
 591                schedule_work(work);
 592        }
 593}
 594
 595/**
 596 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 597 * @mapping: the mapping which wants those buffers written
 598 *
 599 * Starts I/O against the buffers at mapping->private_list, and waits upon
 600 * that I/O.
 601 *
 602 * Basically, this is a convenience function for fsync().
 603 * @mapping is a file or directory which needs those buffers to be written for
 604 * a successful fsync().
 605 */
 606int sync_mapping_buffers(struct address_space *mapping)
 607{
 608        struct address_space *buffer_mapping = mapping->assoc_mapping;
 609
 610        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 611                return 0;
 612
 613        return fsync_buffers_list(&buffer_mapping->private_lock,
 614                                        &mapping->private_list);
 615}
 616EXPORT_SYMBOL(sync_mapping_buffers);
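
/*
 * Illustrative sketch (not part of the original file): an fsync()
 * implementation for a simple buffer-backed filesystem.  Data pages are
 * written through the page cache; the metadata buffers that were queued on
 * ->private_list with mark_buffer_dirty_inode() are then written and waited
 * on here.  The function name is hypothetical.
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err;
 *
 *		err = filemap_write_and_wait_range(mapping, start, end);
 *		if (err)
 *			return err;
 *		return sync_mapping_buffers(mapping);
 *	}
 */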
 617
 618/*
 619 * Called when we've recently written block `bblock', and it is known that
 620 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 621 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 622 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 623 */
 624void write_boundary_block(struct block_device *bdev,
 625                        sector_t bblock, unsigned blocksize)
 626{
 627        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 628        if (bh) {
 629                if (buffer_dirty(bh))
 630                        ll_rw_block(WRITE, 1, &bh);
 631                put_bh(bh);
 632        }
 633}
 634
 635void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 636{
 637        struct address_space *mapping = inode->i_mapping;
 638        struct address_space *buffer_mapping = bh->b_page->mapping;
 639
 640        mark_buffer_dirty(bh);
 641        if (!mapping->assoc_mapping) {
 642                mapping->assoc_mapping = buffer_mapping;
 643        } else {
 644                BUG_ON(mapping->assoc_mapping != buffer_mapping);
 645        }
 646        if (!bh->b_assoc_map) {
 647                spin_lock(&buffer_mapping->private_lock);
 648                list_move_tail(&bh->b_assoc_buffers,
 649                                &mapping->private_list);
 650                bh->b_assoc_map = mapping;
 651                spin_unlock(&buffer_mapping->private_lock);
 652        }
 653}
 654EXPORT_SYMBOL(mark_buffer_dirty_inode);
 655
 656/*
 657 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 658 * dirty.
 659 *
 660 * If warn is true, then emit a warning if the page is not uptodate and has
 661 * not been truncated.
 662 */
 663static void __set_page_dirty(struct page *page,
 664                struct address_space *mapping, int warn)
 665{
 666        spin_lock_irq(&mapping->tree_lock);
 667        if (page->mapping) {    /* Race with truncate? */
 668                WARN_ON_ONCE(warn && !PageUptodate(page));
 669                account_page_dirtied(page, mapping);
 670                radix_tree_tag_set(&mapping->page_tree,
 671                                page_index(page), PAGECACHE_TAG_DIRTY);
 672        }
 673        spin_unlock_irq(&mapping->tree_lock);
 674        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 675}
 676
 677/*
 678 * Add a page to the dirty page list.
 679 *
 680 * It is a sad fact of life that this function is called from several places
 681 * deeply under spinlocking.  It may not sleep.
 682 *
 683 * If the page has buffers, the uptodate buffers are set dirty, to preserve
  684 * dirty-state coherency between the page and the buffers.  If the page does
 685 * not have buffers then when they are later attached they will all be set
 686 * dirty.
 687 *
 688 * The buffers are dirtied before the page is dirtied.  There's a small race
 689 * window in which a writepage caller may see the page cleanness but not the
 690 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 691 * before the buffers, a concurrent writepage caller could clear the page dirty
 692 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 693 * page on the dirty page list.
 694 *
 695 * We use private_lock to lock against try_to_free_buffers while using the
 696 * page's buffer list.  Also use this to protect against clean buffers being
 697 * added to the page after it was set dirty.
 698 *
 699 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 700 * address_space though.
 701 */
 702int __set_page_dirty_buffers(struct page *page)
 703{
 704        int newly_dirty;
 705        struct address_space *mapping = page_mapping(page);
 706
 707        if (unlikely(!mapping))
 708                return !TestSetPageDirty(page);
 709
 710        spin_lock(&mapping->private_lock);
 711        if (page_has_buffers(page)) {
 712                struct buffer_head *head = page_buffers(page);
 713                struct buffer_head *bh = head;
 714
 715                do {
 716                        set_buffer_dirty(bh);
 717                        bh = bh->b_this_page;
 718                } while (bh != head);
 719        }
 720        newly_dirty = !TestSetPageDirty(page);
 721        spin_unlock(&mapping->private_lock);
 722
 723        if (newly_dirty)
 724                __set_page_dirty(page, mapping, 1);
 725        return newly_dirty;
 726}
 727EXPORT_SYMBOL(__set_page_dirty_buffers);
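
/*
 * Illustrative sketch (not part of the original file): __set_page_dirty_buffers
 * is what set_page_dirty() falls back to when an address_space supplies no
 * ->set_page_dirty of its own, and a buffer-backed filesystem may also point
 * at it explicitly.  The structure name is hypothetical and shows only this
 * one field.
 *
 *	static const struct address_space_operations example_aops = {
 *		.set_page_dirty	= __set_page_dirty_buffers,
 *	};
 */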
 728
 729/*
 730 * Write out and wait upon a list of buffers.
 731 *
 732 * We have conflicting pressures: we want to make sure that all
 733 * initially dirty buffers get waited on, but that any subsequently
 734 * dirtied buffers don't.  After all, we don't want fsync to last
 735 * forever if somebody is actively writing to the file.
 736 *
 737 * Do this in two main stages: first we copy dirty buffers to a
 738 * temporary inode list, queueing the writes as we go.  Then we clean
 739 * up, waiting for those writes to complete.
 740 * 
 741 * During this second stage, any subsequent updates to the file may end
 742 * up refiling the buffer on the original inode's dirty list again, so
 743 * there is a chance we will end up with a buffer queued for write but
 744 * not yet completed on that list.  So, as a final cleanup we go through
 745 * the osync code to catch these locked, dirty buffers without requeuing
 746 * any newly dirty buffers for write.
 747 */
 748static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 749{
 750        struct buffer_head *bh;
 751        struct list_head tmp;
 752        struct address_space *mapping;
 753        int err = 0, err2;
 754        struct blk_plug plug;
 755
 756        INIT_LIST_HEAD(&tmp);
 757        blk_start_plug(&plug);
 758
 759        spin_lock(lock);
 760        while (!list_empty(list)) {
 761                bh = BH_ENTRY(list->next);
 762                mapping = bh->b_assoc_map;
 763                __remove_assoc_queue(bh);
 764                /* Avoid race with mark_buffer_dirty_inode() which does
 765                 * a lockless check and we rely on seeing the dirty bit */
 766                smp_mb();
 767                if (buffer_dirty(bh) || buffer_locked(bh)) {
 768                        list_add(&bh->b_assoc_buffers, &tmp);
 769                        bh->b_assoc_map = mapping;
 770                        if (buffer_dirty(bh)) {
 771                                get_bh(bh);
 772                                spin_unlock(lock);
 773                                /*
 774                                 * Ensure any pending I/O completes so that
 775                                 * write_dirty_buffer() actually writes the
 776                                 * current contents - it is a noop if I/O is
 777                                 * still in flight on potentially older
 778                                 * contents.
 779                                 */
 780                                write_dirty_buffer(bh, WRITE_SYNC);
 781
 782                                /*
 783                                 * Kick off IO for the previous mapping. Note
 784                                 * that we will not run the very last mapping,
 785                                 * wait_on_buffer() will do that for us
 786                                 * through sync_buffer().
 787                                 */
 788                                brelse(bh);
 789                                spin_lock(lock);
 790                        }
 791                }
 792        }
 793
 794        spin_unlock(lock);
 795        blk_finish_plug(&plug);
 796        spin_lock(lock);
 797
 798        while (!list_empty(&tmp)) {
 799                bh = BH_ENTRY(tmp.prev);
 800                get_bh(bh);
 801                mapping = bh->b_assoc_map;
 802                __remove_assoc_queue(bh);
 803                /* Avoid race with mark_buffer_dirty_inode() which does
 804                 * a lockless check and we rely on seeing the dirty bit */
 805                smp_mb();
 806                if (buffer_dirty(bh)) {
 807                        list_add(&bh->b_assoc_buffers,
 808                                 &mapping->private_list);
 809                        bh->b_assoc_map = mapping;
 810                }
 811                spin_unlock(lock);
 812                wait_on_buffer(bh);
 813                if (!buffer_uptodate(bh))
 814                        err = -EIO;
 815                brelse(bh);
 816                spin_lock(lock);
 817        }
 818        
 819        spin_unlock(lock);
 820        err2 = osync_buffers_list(lock, list);
 821        if (err)
 822                return err;
 823        else
 824                return err2;
 825}
 826
 827/*
 828 * Invalidate any and all dirty buffers on a given inode.  We are
 829 * probably unmounting the fs, but that doesn't mean we have already
 830 * done a sync().  Just drop the buffers from the inode list.
 831 *
 832 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 833 * assumes that all the buffers are against the blockdev.  Not true
 834 * for reiserfs.
 835 */
 836void invalidate_inode_buffers(struct inode *inode)
 837{
 838        if (inode_has_buffers(inode)) {
 839                struct address_space *mapping = &inode->i_data;
 840                struct list_head *list = &mapping->private_list;
 841                struct address_space *buffer_mapping = mapping->assoc_mapping;
 842
 843                spin_lock(&buffer_mapping->private_lock);
 844                while (!list_empty(list))
 845                        __remove_assoc_queue(BH_ENTRY(list->next));
 846                spin_unlock(&buffer_mapping->private_lock);
 847        }
 848}
 849EXPORT_SYMBOL(invalidate_inode_buffers);
 850
 851/*
 852 * Remove any clean buffers from the inode's buffer list.  This is called
 853 * when we're trying to free the inode itself.  Those buffers can pin it.
 854 *
 855 * Returns true if all buffers were removed.
 856 */
 857int remove_inode_buffers(struct inode *inode)
 858{
 859        int ret = 1;
 860
 861        if (inode_has_buffers(inode)) {
 862                struct address_space *mapping = &inode->i_data;
 863                struct list_head *list = &mapping->private_list;
 864                struct address_space *buffer_mapping = mapping->assoc_mapping;
 865
 866                spin_lock(&buffer_mapping->private_lock);
 867                while (!list_empty(list)) {
 868                        struct buffer_head *bh = BH_ENTRY(list->next);
 869                        if (buffer_dirty(bh)) {
 870                                ret = 0;
 871                                break;
 872                        }
 873                        __remove_assoc_queue(bh);
 874                }
 875                spin_unlock(&buffer_mapping->private_lock);
 876        }
 877        return ret;
 878}
 879
 880/*
 881 * Create the appropriate buffers when given a page for data area and
 882 * the size of each buffer.. Use the bh->b_this_page linked list to
 883 * follow the buffers created.  Return NULL if unable to create more
 884 * buffers.
 885 *
  886 * The retry flag is used to differentiate async IO (paging, swapping),
  887 * which may not fail, from ordinary buffer allocations.
 888 */
 889struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 890                int retry)
 891{
 892        struct buffer_head *bh, *head;
 893        long offset;
 894
 895try_again:
 896        head = NULL;
 897        offset = PAGE_SIZE;
 898        while ((offset -= size) >= 0) {
 899                bh = alloc_buffer_head(GFP_NOFS);
 900                if (!bh)
 901                        goto no_grow;
 902
 903                bh->b_bdev = NULL;
 904                bh->b_this_page = head;
 905                bh->b_blocknr = -1;
 906                head = bh;
 907
 908                bh->b_state = 0;
 909                atomic_set(&bh->b_count, 0);
 910                bh->b_size = size;
 911
 912                /* Link the buffer to its page */
 913                set_bh_page(bh, page, offset);
 914
 915                init_buffer(bh, NULL, NULL);
 916        }
 917        return head;
 918/*
 919 * In case anything failed, we just free everything we got.
 920 */
 921no_grow:
 922        if (head) {
 923                do {
 924                        bh = head;
 925                        head = head->b_this_page;
 926                        free_buffer_head(bh);
 927                } while (head);
 928        }
 929
 930        /*
 931         * Return failure for non-async IO requests.  Async IO requests
 932         * are not allowed to fail, so we have to wait until buffer heads
 933         * become available.  But we don't want tasks sleeping with 
 934         * partially complete buffers, so all were released above.
 935         */
 936        if (!retry)
 937                return NULL;
 938
 939        /* We're _really_ low on memory. Now we just
 940         * wait for old buffer heads to become free due to
 941         * finishing IO.  Since this is an async request and
 942         * the reserve list is empty, we're sure there are 
 943         * async buffer heads in use.
 944         */
 945        free_more_memory();
 946        goto try_again;
 947}
 948EXPORT_SYMBOL_GPL(alloc_page_buffers);
 949
 950static inline void
 951link_dev_buffers(struct page *page, struct buffer_head *head)
 952{
 953        struct buffer_head *bh, *tail;
 954
 955        bh = head;
 956        do {
 957                tail = bh;
 958                bh = bh->b_this_page;
 959        } while (bh);
 960        tail->b_this_page = head;
 961        attach_page_buffers(page, head);
 962}
 963
 964/*
 965 * Initialise the state of a blockdev page's buffers.
 966 */ 
 967static void
 968init_page_buffers(struct page *page, struct block_device *bdev,
 969                        sector_t block, int size)
 970{
 971        struct buffer_head *head = page_buffers(page);
 972        struct buffer_head *bh = head;
 973        int uptodate = PageUptodate(page);
 974
 975        do {
 976                if (!buffer_mapped(bh)) {
 977                        init_buffer(bh, NULL, NULL);
 978                        bh->b_bdev = bdev;
 979                        bh->b_blocknr = block;
 980                        if (uptodate)
 981                                set_buffer_uptodate(bh);
 982                        set_buffer_mapped(bh);
 983                }
 984                block++;
 985                bh = bh->b_this_page;
 986        } while (bh != head);
 987}
 988
 989/*
 990 * Create the page-cache page that contains the requested block.
 991 *
  992 * This is used purely for blockdev mappings.
 993 */
 994static struct page *
 995grow_dev_page(struct block_device *bdev, sector_t block,
 996                pgoff_t index, int size)
 997{
 998        struct inode *inode = bdev->bd_inode;
 999        struct page *page;
1000        struct buffer_head *bh;
1001
1002        page = find_or_create_page(inode->i_mapping, index,
1003                (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1004        if (!page)
1005                return NULL;
1006
1007        BUG_ON(!PageLocked(page));
1008
1009        if (page_has_buffers(page)) {
1010                bh = page_buffers(page);
1011                if (bh->b_size == size) {
1012                        init_page_buffers(page, bdev, block, size);
1013                        return page;
1014                }
1015                if (!try_to_free_buffers(page))
1016                        goto failed;
1017        }
1018
1019        /*
1020         * Allocate some buffers for this page
1021         */
1022        bh = alloc_page_buffers(page, size, 0);
1023        if (!bh)
1024                goto failed;
1025
1026        /*
1027         * Link the page to the buffers and initialise them.  Take the
1028         * lock to be atomic wrt __find_get_block(), which does not
1029         * run under the page lock.
1030         */
1031        spin_lock(&inode->i_mapping->private_lock);
1032        link_dev_buffers(page, bh);
1033        init_page_buffers(page, bdev, block, size);
1034        spin_unlock(&inode->i_mapping->private_lock);
1035        return page;
1036
1037failed:
1038        BUG();
1039        unlock_page(page);
1040        page_cache_release(page);
1041        return NULL;
1042}
1043
1044/*
1045 * Create buffers for the specified block device block's page.  If
1046 * that page was dirty, the buffers are set dirty also.
1047 */
1048static int
1049grow_buffers(struct block_device *bdev, sector_t block, int size)
1050{
1051        struct page *page;
1052        pgoff_t index;
1053        int sizebits;
1054
1055        sizebits = -1;
1056        do {
1057                sizebits++;
1058        } while ((size << sizebits) < PAGE_SIZE);
1059
1060        index = block >> sizebits;
1061
1062        /*
1063         * Check for a block which wants to lie outside our maximum possible
1064         * pagecache index.  (this comparison is done using sector_t types).
1065         */
1066        if (unlikely(index != block >> sizebits)) {
1067                char b[BDEVNAME_SIZE];
1068
1069                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1070                        "device %s\n",
1071                        __func__, (unsigned long long)block,
1072                        bdevname(bdev, b));
1073                return -EIO;
1074        }
1075        block = index << sizebits;
1076        /* Create a page with the proper size buffers.. */
1077        page = grow_dev_page(bdev, block, index, size);
1078        if (!page)
1079                return 0;
1080        unlock_page(page);
1081        page_cache_release(page);
1082        return 1;
1083}
1084
1085static struct buffer_head *
1086__getblk_slow(struct block_device *bdev, sector_t block, int size)
1087{
1088        /* Size must be multiple of hard sectorsize */
1089        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1090                        (size < 512 || size > PAGE_SIZE))) {
1091                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1092                                        size);
1093                printk(KERN_ERR "logical block size: %d\n",
1094                                        bdev_logical_block_size(bdev));
1095
1096                dump_stack();
1097                return NULL;
1098        }
1099
1100        for (;;) {
1101                struct buffer_head * bh;
1102                int ret;
1103
1104                bh = __find_get_block(bdev, block, size);
1105                if (bh)
1106                        return bh;
1107
1108                ret = grow_buffers(bdev, block, size);
1109                if (ret < 0)
1110                        return NULL;
1111                if (ret == 0)
1112                        free_more_memory();
1113        }
1114}
1115
1116/*
1117 * The relationship between dirty buffers and dirty pages:
1118 *
1119 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1120 * the page is tagged dirty in its radix tree.
1121 *
1122 * At all times, the dirtiness of the buffers represents the dirtiness of
1123 * subsections of the page.  If the page has buffers, the page dirty bit is
1124 * merely a hint about the true dirty state.
1125 *
1126 * When a page is set dirty in its entirety, all its buffers are marked dirty
1127 * (if the page has buffers).
1128 *
1129 * When a buffer is marked dirty, its page is dirtied, but the page's other
1130 * buffers are not.
1131 *
1132 * Also.  When blockdev buffers are explicitly read with bread(), they
1133 * individually become uptodate.  But their backing page remains not
1134 * uptodate - even if all of its buffers are uptodate.  A subsequent
1135 * block_read_full_page() against that page will discover all the uptodate
1136 * buffers, will set the page uptodate and will perform no I/O.
1137 */
1138
1139/**
1140 * mark_buffer_dirty - mark a buffer_head as needing writeout
1141 * @bh: the buffer_head to mark dirty
1142 *
1143 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1144 * backing page dirty, then tag the page as dirty in its address_space's radix
1145 * tree and then attach the address_space's inode to its superblock's dirty
1146 * inode list.
1147 *
1148 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1149 * mapping->tree_lock and mapping->host->i_lock.
1150 */
1151void mark_buffer_dirty(struct buffer_head *bh)
1152{
1153        WARN_ON_ONCE(!buffer_uptodate(bh));
1154
1155        /*
1156         * Very *carefully* optimize the it-is-already-dirty case.
1157         *
1158         * Don't let the final "is it dirty" escape to before we
1159         * perhaps modified the buffer.
1160         */
1161        if (buffer_dirty(bh)) {
1162                smp_mb();
1163                if (buffer_dirty(bh))
1164                        return;
1165        }
1166
1167        if (!test_set_buffer_dirty(bh)) {
1168                struct page *page = bh->b_page;
1169                if (!TestSetPageDirty(page)) {
1170                        struct address_space *mapping = page_mapping(page);
1171                        if (mapping)
1172                                __set_page_dirty(page, mapping, 0);
1173                }
1174        }
1175}
1176EXPORT_SYMBOL(mark_buffer_dirty);
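
/*
 * Illustrative sketch (not part of the original file): the usual sequence
 * for modifying a metadata block - read it, update the contents, then
 * mark_buffer_dirty() so that writeback (or a later sync_dirty_buffer())
 * picks it up.  The function name is hypothetical; bounds checking of
 * @offset is omitted.
 *
 *	static void example_update_le32(struct super_block *sb, sector_t block,
 *					unsigned int offset, u32 val)
 *	{
 *		struct buffer_head *bh = sb_bread(sb, block);
 *
 *		if (!bh)
 *			return;
 *		lock_buffer(bh);
 *		*(__le32 *)(bh->b_data + offset) = cpu_to_le32(val);
 *		unlock_buffer(bh);
 *		mark_buffer_dirty(bh);
 *		brelse(bh);
 *	}
 */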
1177
1178/*
1179 * Decrement a buffer_head's reference count.  If all buffers against a page
1180 * have zero reference count, are clean and unlocked, and if the page is clean
1181 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1182 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1183 * a page but it ends up not being freed, and buffers may later be reattached).
1184 */
1185void __brelse(struct buffer_head * buf)
1186{
1187        if (atomic_read(&buf->b_count)) {
1188                put_bh(buf);
1189                return;
1190        }
1191        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1192}
1193EXPORT_SYMBOL(__brelse);
1194
1195/*
1196 * bforget() is like brelse(), except it discards any
1197 * potentially dirty data.
1198 */
1199void __bforget(struct buffer_head *bh)
1200{
1201        clear_buffer_dirty(bh);
1202        if (bh->b_assoc_map) {
1203                struct address_space *buffer_mapping = bh->b_page->mapping;
1204
1205                spin_lock(&buffer_mapping->private_lock);
1206                list_del_init(&bh->b_assoc_buffers);
1207                bh->b_assoc_map = NULL;
1208                spin_unlock(&buffer_mapping->private_lock);
1209        }
1210        __brelse(bh);
1211}
1212EXPORT_SYMBOL(__bforget);
1213
1214static struct buffer_head *__bread_slow(struct buffer_head *bh)
1215{
1216        lock_buffer(bh);
1217        if (buffer_uptodate(bh)) {
1218                unlock_buffer(bh);
1219                return bh;
1220        } else {
1221                get_bh(bh);
1222                bh->b_end_io = end_buffer_read_sync;
1223                submit_bh(READ, bh);
1224                wait_on_buffer(bh);
1225                if (buffer_uptodate(bh))
1226                        return bh;
1227        }
1228        brelse(bh);
1229        return NULL;
1230}
1231
1232/*
1233 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1234 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1235 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1236 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1237 * CPU's LRUs at the same time.
1238 *
1239 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1240 * sb_find_get_block().
1241 *
1242 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1243 * a local interrupt disable for that.
1244 */
1245
1246#define BH_LRU_SIZE     8
1247
1248struct bh_lru {
1249        struct buffer_head *bhs[BH_LRU_SIZE];
1250};
1251
1252static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1253
1254#ifdef CONFIG_SMP
1255#define bh_lru_lock()   local_irq_disable()
1256#define bh_lru_unlock() local_irq_enable()
1257#else
1258#define bh_lru_lock()   preempt_disable()
1259#define bh_lru_unlock() preempt_enable()
1260#endif
1261
1262static inline void check_irqs_on(void)
1263{
1264#ifdef irqs_disabled
1265        BUG_ON(irqs_disabled());
1266#endif
1267}
1268
1269/*
1270 * The LRU management algorithm is dopey-but-simple.  Sorry.
1271 */
1272static void bh_lru_install(struct buffer_head *bh)
1273{
1274        struct buffer_head *evictee = NULL;
1275
1276        check_irqs_on();
1277        bh_lru_lock();
1278        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1279                struct buffer_head *bhs[BH_LRU_SIZE];
1280                int in;
1281                int out = 0;
1282
1283                get_bh(bh);
1284                bhs[out++] = bh;
1285                for (in = 0; in < BH_LRU_SIZE; in++) {
1286                        struct buffer_head *bh2 =
1287                                __this_cpu_read(bh_lrus.bhs[in]);
1288
1289                        if (bh2 == bh) {
1290                                __brelse(bh2);
1291                        } else {
1292                                if (out >= BH_LRU_SIZE) {
1293                                        BUG_ON(evictee != NULL);
1294                                        evictee = bh2;
1295                                } else {
1296                                        bhs[out++] = bh2;
1297                                }
1298                        }
1299                }
1300                while (out < BH_LRU_SIZE)
1301                        bhs[out++] = NULL;
1302                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1303        }
1304        bh_lru_unlock();
1305
1306        if (evictee)
1307                __brelse(evictee);
1308}
1309
1310/*
1311 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1312 */
1313static struct buffer_head *
1314lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1315{
1316        struct buffer_head *ret = NULL;
1317        unsigned int i;
1318
1319        check_irqs_on();
1320        bh_lru_lock();
1321        for (i = 0; i < BH_LRU_SIZE; i++) {
1322                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1323
1324                if (bh && bh->b_bdev == bdev &&
1325                                bh->b_blocknr == block && bh->b_size == size) {
1326                        if (i) {
1327                                while (i) {
1328                                        __this_cpu_write(bh_lrus.bhs[i],
1329                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
1330                                        i--;
1331                                }
1332                                __this_cpu_write(bh_lrus.bhs[0], bh);
1333                        }
1334                        get_bh(bh);
1335                        ret = bh;
1336                        break;
1337                }
1338        }
1339        bh_lru_unlock();
1340        return ret;
1341}
1342
1343/*
1344 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1345 * it in the LRU and mark it as accessed.  If it is not present then return
1346 * NULL
1347 */
1348struct buffer_head *
1349__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1350{
1351        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1352
1353        if (bh == NULL) {
1354                bh = __find_get_block_slow(bdev, block);
1355                if (bh)
1356                        bh_lru_install(bh);
1357        }
1358        if (bh)
1359                touch_buffer(bh);
1360        return bh;
1361}
1362EXPORT_SYMBOL(__find_get_block);
1363
1364/*
1365 * __getblk will locate (and, if necessary, create) the buffer_head
1366 * which corresponds to the passed block_device, block and size. The
1367 * returned buffer has its reference count incremented.
1368 *
1369 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1370 * illegal block number, __getblk() will happily return a buffer_head
1371 * which represents the non-existent block.  Very weird.
1372 *
1373 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1374 * attempt is failing.  FIXME, perhaps?
1375 */
1376struct buffer_head *
1377__getblk(struct block_device *bdev, sector_t block, unsigned size)
1378{
1379        struct buffer_head *bh = __find_get_block(bdev, block, size);
1380
1381        might_sleep();
1382        if (bh == NULL)
1383                bh = __getblk_slow(bdev, block, size);
1384        return bh;
1385}
1386EXPORT_SYMBOL(__getblk);
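
/*
 * Illustrative sketch (not part of the original file): initialising a block
 * that has just been allocated on disk.  There is nothing useful to read,
 * so the buffer is obtained with sb_getblk(), zeroed and marked uptodate by
 * hand, then dirtied.  The function name is hypothetical; the caller is
 * expected to brelse() the result.
 *
 *	static struct buffer_head *example_init_new_block(struct super_block *sb,
 *							  sector_t block)
 *	{
 *		struct buffer_head *bh = sb_getblk(sb, block);
 *
 *		if (!bh)
 *			return NULL;
 *		lock_buffer(bh);
 *		memset(bh->b_data, 0, bh->b_size);
 *		set_buffer_uptodate(bh);
 *		unlock_buffer(bh);
 *		mark_buffer_dirty(bh);
 *		return bh;
 *	}
 */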
1387
1388/*
1389 * Do async read-ahead on a buffer..
1390 */
1391void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1392{
1393        struct buffer_head *bh = __getblk(bdev, block, size);
1394        if (likely(bh)) {
1395                ll_rw_block(READA, 1, &bh);
1396                brelse(bh);
1397        }
1398}
1399EXPORT_SYMBOL(__breadahead);
1400
1401/**
1402 *  __bread() - reads a specified block and returns the bh
1403 *  @bdev: the block_device to read from
1404 *  @block: number of block
1405 *  @size: size (in bytes) to read
1406 * 
1407 *  Reads a specified block, and returns buffer head that contains it.
1408 *  It returns NULL if the block was unreadable.
1409 */
1410struct buffer_head *
1411__bread(struct block_device *bdev, sector_t block, unsigned size)
1412{
1413        struct buffer_head *bh = __getblk(bdev, block, size);
1414
1415        if (likely(bh) && !buffer_uptodate(bh))
1416                bh = __bread_slow(bh);
1417        return bh;
1418}
1419EXPORT_SYMBOL(__bread);
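
/*
 * Illustrative sketch (not part of the original file): reading a metadata
 * block through the sb_bread() wrapper while hinting that the next block
 * will probably be wanted too.  The function name is hypothetical.
 *
 *	static int example_read_first_byte(struct super_block *sb, sector_t block)
 *	{
 *		struct buffer_head *bh;
 *		int val;
 *
 *		sb_breadahead(sb, block + 1);
 *		bh = sb_bread(sb, block);
 *		if (!bh)
 *			return -EIO;
 *		val = *(u8 *)bh->b_data;
 *		brelse(bh);
 *		return val;
 *	}
 */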
1420
1421/*
1422 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1423 * This doesn't race because it runs in each cpu either in irq
1424 * or with preempt disabled.
1425 */
1426static void invalidate_bh_lru(void *arg)
1427{
1428        struct bh_lru *b = &get_cpu_var(bh_lrus);
1429        int i;
1430
1431        for (i = 0; i < BH_LRU_SIZE; i++) {
1432                brelse(b->bhs[i]);
1433                b->bhs[i] = NULL;
1434        }
1435        put_cpu_var(bh_lrus);
1436}
1437        
1438void invalidate_bh_lrus(void)
1439{
1440        on_each_cpu(invalidate_bh_lru, NULL, 1);
1441}
1442EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1443
1444void set_bh_page(struct buffer_head *bh,
1445                struct page *page, unsigned long offset)
1446{
1447        bh->b_page = page;
1448        BUG_ON(offset >= PAGE_SIZE);
1449        if (PageHighMem(page))
1450                /*
1451                 * This catches illegal uses and preserves the offset:
1452                 */
1453                bh->b_data = (char *)(0 + offset);
1454        else
1455                bh->b_data = page_address(page) + offset;
1456}
1457EXPORT_SYMBOL(set_bh_page);
1458
1459/*
1460 * Called when truncating a buffer on a page completely.
1461 */
1462static void discard_buffer(struct buffer_head * bh)
1463{
1464        lock_buffer(bh);
1465        clear_buffer_dirty(bh);
1466        bh->b_bdev = NULL;
1467        clear_buffer_mapped(bh);
1468        clear_buffer_req(bh);
1469        clear_buffer_new(bh);
1470        clear_buffer_delay(bh);
1471        clear_buffer_unwritten(bh);
1472        unlock_buffer(bh);
1473}
1474
1475/**
1476 * block_invalidatepage - invalidate part or all of a buffer-backed page
1477 *
1478 * @page: the page which is affected
1479 * @offset: the index of the truncation point
1480 *
1481 * block_invalidatepage() is called when all or part of the page has become
1482 * invalidated by a truncate operation.
1483 *
1484 * block_invalidatepage() does not have to release all buffers, but it must
1485 * ensure that no dirty buffer is left outside @offset and that no I/O
1486 * is underway against any of the blocks which are outside the truncation
1487 * point.  Because the caller is about to free (and possibly reuse) those
1488 * blocks on-disk.
1489 */
1490void block_invalidatepage(struct page *page, unsigned long offset)
1491{
1492        struct buffer_head *head, *bh, *next;
1493        unsigned int curr_off = 0;
1494
1495        BUG_ON(!PageLocked(page));
1496        if (!page_has_buffers(page))
1497                goto out;
1498
1499        head = page_buffers(page);
1500        bh = head;
1501        do {
1502                unsigned int next_off = curr_off + bh->b_size;
1503                next = bh->b_this_page;
1504
1505                /*
1506                 * is this block fully invalidated?
1507                 */
1508                if (offset <= curr_off)
1509                        discard_buffer(bh);
1510                curr_off = next_off;
1511                bh = next;
1512        } while (bh != head);
1513
1514        /*
1515         * We release buffers only if the entire page is being invalidated.
1516         * The get_block cached value has been unconditionally invalidated,
1517         * so real IO is not possible anymore.
1518         */
1519        if (offset == 0)
1520                try_to_release_page(page, 0);
1521out:
1522        return;
1523}
1524EXPORT_SYMBOL(block_invalidatepage);
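
/*
 * Illustrative sketch, not part of the original file: a filesystem with no
 * special invalidation needs can point ->invalidatepage in its
 * address_space_operations at a thin wrapper like this ("examplefs" is a
 * hypothetical name).
 */
static void examplefs_invalidatepage(struct page *page, unsigned long offset)
{
        block_invalidatepage(page, offset);
}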
1525
1526/*
1527 * We attach and possibly dirty the buffers atomically wrt
1528 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1529 * is already excluded via the page lock.
1530 */
1531void create_empty_buffers(struct page *page,
1532                        unsigned long blocksize, unsigned long b_state)
1533{
1534        struct buffer_head *bh, *head, *tail;
1535
1536        head = alloc_page_buffers(page, blocksize, 1);
1537        bh = head;
1538        do {
1539                bh->b_state |= b_state;
1540                tail = bh;
1541                bh = bh->b_this_page;
1542        } while (bh);
1543        tail->b_this_page = head;
1544
1545        spin_lock(&page->mapping->private_lock);
1546        if (PageUptodate(page) || PageDirty(page)) {
1547                bh = head;
1548                do {
1549                        if (PageDirty(page))
1550                                set_buffer_dirty(bh);
1551                        if (PageUptodate(page))
1552                                set_buffer_uptodate(bh);
1553                        bh = bh->b_this_page;
1554                } while (bh != head);
1555        }
1556        attach_page_buffers(page, head);
1557        spin_unlock(&page->mapping->private_lock);
1558}
1559EXPORT_SYMBOL(create_empty_buffers);
1560
1561/*
1562 * We are taking a block for data and we don't want any output from any
1563 * buffer-cache aliases starting from the return of this function and
1564 * until the moment when something explicitly marks the buffer
1565 * dirty (hopefully that will not happen until we free that block ;-)
1566 * We don't even need to mark it not-uptodate - nobody can expect
1567 * anything from a newly allocated buffer anyway. We used to use
1568 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1569 * don't want to mark the alias unmapped, for example - it would confuse
1570 * anyone who might pick it up with bread() afterwards...
1571 *
1572 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1573 * be writeout I/O going on against recently-freed buffers.  We don't
1574 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1575 * only if we really need to.  That happens here.
1576 */
1577void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1578{
1579        struct buffer_head *old_bh;
1580
1581        might_sleep();
1582
1583        old_bh = __find_get_block_slow(bdev, block);
1584        if (old_bh) {
1585                clear_buffer_dirty(old_bh);
1586                wait_on_buffer(old_bh);
1587                clear_buffer_req(old_bh);
1588                __brelse(old_bh);
1589        }
1590}
1591EXPORT_SYMBOL(unmap_underlying_metadata);
1592
1593/*
1594 * NOTE! All mapped/uptodate combinations are valid:
1595 *
1596 *      Mapped  Uptodate        Meaning
1597 *
1598 *      No      No              "unknown" - must do get_block()
1599 *      No      Yes             "hole" - zero-filled
1600 *      Yes     No              "allocated" - allocated on disk, not read in
1601 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1602 *
1603 * "Dirty" is valid only with the last case (mapped+uptodate).
1604 */
1605
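/*
 * Illustrative sketch, not part of the original file: the four
 * mapped/uptodate combinations from the table above, expressed as code.
 */
static const char *example_bh_state_name(struct buffer_head *bh)
{
        if (!buffer_mapped(bh))
                return buffer_uptodate(bh) ? "hole" : "unknown";
        return buffer_uptodate(bh) ? "valid" : "allocated";
}
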
1606/*
1607 * While block_write_full_page is writing back the dirty buffers under
1608 * the page lock, whoever dirtied the buffers may decide to clean them
1609 * again at any time.  We handle that by only looking at the buffer
1610 * state inside lock_buffer().
1611 *
1612 * If block_write_full_page() is called for regular writeback
1613 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1614 * locked buffer.   This can only happen if someone has written the buffer
1615 * directly, with submit_bh().  At the address_space level PageWriteback
1616 * prevents this contention from occurring.
1617 *
1618 * If block_write_full_page() is called with wbc->sync_mode ==
1619 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1620 * causes the writes to be flagged as synchronous writes.
1621 */
1622static int __block_write_full_page(struct inode *inode, struct page *page,
1623                        get_block_t *get_block, struct writeback_control *wbc,
1624                        bh_end_io_t *handler)
1625{
1626        int err;
1627        sector_t block;
1628        sector_t last_block;
1629        struct buffer_head *bh, *head;
1630        const unsigned blocksize = 1 << inode->i_blkbits;
1631        int nr_underway = 0;
1632        int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1633                        WRITE_SYNC : WRITE);
1634
1635        BUG_ON(!PageLocked(page));
1636
1637        last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1638
1639        if (!page_has_buffers(page)) {
1640                create_empty_buffers(page, blocksize,
1641                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1642        }
1643
1644        /*
1645         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1646         * here, and the (potentially unmapped) buffers may become dirty at
1647         * any time.  If a buffer becomes dirty here after we've inspected it
1648         * then we just miss that fact, and the page stays dirty.
1649         *
1650         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1651         * handle that here by just cleaning them.
1652         */
1653
1654        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1655        head = page_buffers(page);
1656        bh = head;
1657
1658        /*
1659         * Get all the dirty buffers mapped to disk addresses and
1660         * handle any aliases from the underlying blockdev's mapping.
1661         */
1662        do {
1663                if (block > last_block) {
1664                        /*
1665                         * mapped buffers outside i_size will occur, because
1666                         * this page can be outside i_size when there is a
1667                         * truncate in progress.
1668                         */
1669                        /*
1670                         * The buffer was zeroed by block_write_full_page()
1671                         */
1672                        clear_buffer_dirty(bh);
1673                        set_buffer_uptodate(bh);
1674                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1675                           buffer_dirty(bh)) {
1676                        WARN_ON(bh->b_size != blocksize);
1677                        err = get_block(inode, block, bh, 1);
1678                        if (err)
1679                                goto recover;
1680                        clear_buffer_delay(bh);
1681                        if (buffer_new(bh)) {
1682                                /* blockdev mappings never come here */
1683                                clear_buffer_new(bh);
1684                                unmap_underlying_metadata(bh->b_bdev,
1685                                                        bh->b_blocknr);
1686                        }
1687                }
1688                bh = bh->b_this_page;
1689                block++;
1690        } while (bh != head);
1691
1692        do {
1693                if (!buffer_mapped(bh))
1694                        continue;
1695                /*
1696                 * If it's a fully non-blocking write attempt and we cannot
1697                 * lock the buffer then redirty the page.  Note that this can
1698                 * potentially cause a busy-wait loop from writeback threads
1699                 * and kswapd activity, but those code paths have their own
1700                 * higher-level throttling.
1701                 */
1702                if (wbc->sync_mode != WB_SYNC_NONE) {
1703                        lock_buffer(bh);
1704                } else if (!trylock_buffer(bh)) {
1705                        redirty_page_for_writepage(wbc, page);
1706                        continue;
1707                }
1708                if (test_clear_buffer_dirty(bh)) {
1709                        mark_buffer_async_write_endio(bh, handler);
1710                } else {
1711                        unlock_buffer(bh);
1712                }
1713        } while ((bh = bh->b_this_page) != head);
1714
1715        /*
1716         * The page and its buffers are protected by PageWriteback(), so we can
1717         * drop the bh refcounts early.
1718         */
1719        BUG_ON(PageWriteback(page));
1720        set_page_writeback(page);
1721
1722        do {
1723                struct buffer_head *next = bh->b_this_page;
1724                if (buffer_async_write(bh)) {
1725                        submit_bh(write_op, bh);
1726                        nr_underway++;
1727                }
1728                bh = next;
1729        } while (bh != head);
1730        unlock_page(page);
1731
1732        err = 0;
1733done:
1734        if (nr_underway == 0) {
1735                /*
1736                 * The page was marked dirty, but the buffers were
1737                 * clean.  Someone wrote them back by hand with
1738                 * ll_rw_block/submit_bh.  A rare case.
1739                 */
1740                end_page_writeback(page);
1741
1742                /*
1743                 * The page and buffer_heads can be released at any time from
1744                 * here on.
1745                 */
1746        }
1747        return err;
1748
1749recover:
1750        /*
1751         * ENOSPC, or some other error.  We may already have added some
1752         * blocks to the file, so we need to write these out to avoid
1753         * exposing stale data.
1754         * The page is currently locked and not marked for writeback
1755         */
1756        bh = head;
1757        /* Recovery: lock and submit the mapped buffers */
1758        do {
1759                if (buffer_mapped(bh) && buffer_dirty(bh) &&
1760                    !buffer_delay(bh)) {
1761                        lock_buffer(bh);
1762                        mark_buffer_async_write_endio(bh, handler);
1763                } else {
1764                        /*
1765                         * The buffer may have been set dirty during
1766                         * attachment to a dirty page.
1767                         */
1768                        clear_buffer_dirty(bh);
1769                }
1770        } while ((bh = bh->b_this_page) != head);
1771        SetPageError(page);
1772        BUG_ON(PageWriteback(page));
1773        mapping_set_error(page->mapping, err);
1774        set_page_writeback(page);
1775        do {
1776                struct buffer_head *next = bh->b_this_page;
1777                if (buffer_async_write(bh)) {
1778                        clear_buffer_dirty(bh);
1779                        submit_bh(write_op, bh);
1780                        nr_underway++;
1781                }
1782                bh = next;
1783        } while (bh != head);
1784        unlock_page(page);
1785        goto done;
1786}
1787
1788/*
1789 * If a page has any new buffers, zero them out here, and mark them uptodate
1790 * and dirty so they'll be written out (in order to prevent uninitialised
1791 * block data from leaking), and clear the new bit.
1792 */
1793void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1794{
1795        unsigned int block_start, block_end;
1796        struct buffer_head *head, *bh;
1797
1798        BUG_ON(!PageLocked(page));
1799        if (!page_has_buffers(page))
1800                return;
1801
1802        bh = head = page_buffers(page);
1803        block_start = 0;
1804        do {
1805                block_end = block_start + bh->b_size;
1806
1807                if (buffer_new(bh)) {
1808                        if (block_end > from && block_start < to) {
1809                                if (!PageUptodate(page)) {
1810                                        unsigned start, size;
1811
1812                                        start = max(from, block_start);
1813                                        size = min(to, block_end) - start;
1814
1815                                        zero_user(page, start, size);
1816                                        set_buffer_uptodate(bh);
1817                                }
1818
1819                                clear_buffer_new(bh);
1820                                mark_buffer_dirty(bh);
1821                        }
1822                }
1823
1824                block_start = block_end;
1825                bh = bh->b_this_page;
1826        } while (bh != head);
1827}
1828EXPORT_SYMBOL(page_zero_new_buffers);
1829
1830int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1831                get_block_t *get_block)
1832{
1833        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1834        unsigned to = from + len;
1835        struct inode *inode = page->mapping->host;
1836        unsigned block_start, block_end;
1837        sector_t block;
1838        int err = 0;
1839        unsigned blocksize, bbits;
1840        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1841
1842        BUG_ON(!PageLocked(page));
1843        BUG_ON(from > PAGE_CACHE_SIZE);
1844        BUG_ON(to > PAGE_CACHE_SIZE);
1845        BUG_ON(from > to);
1846
1847        blocksize = 1 << inode->i_blkbits;
1848        if (!page_has_buffers(page))
1849                create_empty_buffers(page, blocksize, 0);
1850        head = page_buffers(page);
1851
1852        bbits = inode->i_blkbits;
1853        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1854
1855        for(bh = head, block_start = 0; bh != head || !block_start;
1856            block++, block_start=block_end, bh = bh->b_this_page) {
1857                block_end = block_start + blocksize;
1858                if (block_end <= from || block_start >= to) {
1859                        if (PageUptodate(page)) {
1860                                if (!buffer_uptodate(bh))
1861                                        set_buffer_uptodate(bh);
1862                        }
1863                        continue;
1864                }
1865                if (buffer_new(bh))
1866                        clear_buffer_new(bh);
1867                if (!buffer_mapped(bh)) {
1868                        WARN_ON(bh->b_size != blocksize);
1869                        err = get_block(inode, block, bh, 1);
1870                        if (err)
1871                                break;
1872                        if (buffer_new(bh)) {
1873                                unmap_underlying_metadata(bh->b_bdev,
1874                                                        bh->b_blocknr);
1875                                if (PageUptodate(page)) {
1876                                        clear_buffer_new(bh);
1877                                        set_buffer_uptodate(bh);
1878                                        mark_buffer_dirty(bh);
1879                                        continue;
1880                                }
1881                                if (block_end > to || block_start < from)
1882                                        zero_user_segments(page,
1883                                                to, block_end,
1884                                                block_start, from);
1885                                continue;
1886                        }
1887                }
1888                if (PageUptodate(page)) {
1889                        if (!buffer_uptodate(bh))
1890                                set_buffer_uptodate(bh);
1891                        continue; 
1892                }
1893                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1894                    !buffer_unwritten(bh) &&
1895                     (block_start < from || block_end > to)) {
1896                        ll_rw_block(READ, 1, &bh);
1897                        *wait_bh++=bh;
1898                }
1899        }
1900        /*
1901         * If we issued read requests - let them complete.
1902         */
1903        while(wait_bh > wait) {
1904                wait_on_buffer(*--wait_bh);
1905                if (!buffer_uptodate(*wait_bh))
1906                        err = -EIO;
1907        }
1908        if (unlikely(err))
1909                page_zero_new_buffers(page, from, to);
1910        return err;
1911}
1912EXPORT_SYMBOL(__block_write_begin);
1913
1914static int __block_commit_write(struct inode *inode, struct page *page,
1915                unsigned from, unsigned to)
1916{
1917        unsigned block_start, block_end;
1918        int partial = 0;
1919        unsigned blocksize;
1920        struct buffer_head *bh, *head;
1921
1922        blocksize = 1 << inode->i_blkbits;
1923
1924        for(bh = head = page_buffers(page), block_start = 0;
1925            bh != head || !block_start;
1926            block_start=block_end, bh = bh->b_this_page) {
1927                block_end = block_start + blocksize;
1928                if (block_end <= from || block_start >= to) {
1929                        if (!buffer_uptodate(bh))
1930                                partial = 1;
1931                } else {
1932                        set_buffer_uptodate(bh);
1933                        mark_buffer_dirty(bh);
1934                }
1935                clear_buffer_new(bh);
1936        }
1937
1938        /*
1939         * If this is a partial write which happened to make all buffers
1940         * uptodate then we can optimize away a bogus readpage() for
1941         * the next read(). Here we 'discover' whether the page went
1942         * uptodate as a result of this (potentially partial) write.
1943         */
1944        if (!partial)
1945                SetPageUptodate(page);
1946        return 0;
1947}
1948
1949/*
1950 * block_write_begin takes care of the basic task of block allocation and
1951 * bringing partial write blocks uptodate first.
1952 *
1953 * The filesystem needs to handle block truncation upon failure.
1954 */
1955int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1956                unsigned flags, struct page **pagep, get_block_t *get_block)
1957{
1958        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1959        struct page *page;
1960        int status;
1961
1962        page = grab_cache_page_write_begin(mapping, index, flags);
1963        if (!page)
1964                return -ENOMEM;
1965
1966        status = __block_write_begin(page, pos, len, get_block);
1967        if (unlikely(status)) {
1968                unlock_page(page);
1969                page_cache_release(page);
1970                page = NULL;
1971        }
1972
1973        *pagep = page;
1974        return status;
1975}
1976EXPORT_SYMBOL(block_write_begin);
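
/*
 * Illustrative sketch, not part of the original file: a filesystem's
 * ->write_begin is typically a thin wrapper around block_write_begin(),
 * passing the filesystem's own get_block callback.  Both "examplefs"
 * functions below are hypothetical; the toy get_block simply pretends
 * file block N lives at device block N, where a real filesystem would
 * look the block up on disk and, if @create is set, allocate it.
 */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
                               struct buffer_head *bh, int create)
{
        /* mark bh mapped to "disk block" iblock on this inode's device */
        map_bh(bh, inode->i_sb, iblock);
        return 0;
}

static int examplefs_write_begin(struct file *file, struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
{
        return block_write_begin(mapping, pos, len, flags, pagep,
                                 examplefs_get_block);
}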
1977
1978int block_write_end(struct file *file, struct address_space *mapping,
1979                        loff_t pos, unsigned len, unsigned copied,
1980                        struct page *page, void *fsdata)
1981{
1982        struct inode *inode = mapping->host;
1983        unsigned start;
1984
1985        start = pos & (PAGE_CACHE_SIZE - 1);
1986
1987        if (unlikely(copied < len)) {
1988                /*
1989                 * The buffers that were written will now be uptodate, so we
1990                 * don't have to worry about a readpage reading them and
1991                 * overwriting a partial write. However if we have encountered
1992                 * a short write and only partially written into a buffer, it
1993                 * will not be marked uptodate, so a readpage might come in and
1994                 * destroy our partial write.
1995                 *
1996                 * Do the simplest thing, and just treat any short write to a
1997                 * non uptodate page as a zero-length write, and force the
1998                 * caller to redo the whole thing.
1999                 */
2000                if (!PageUptodate(page))
2001                        copied = 0;
2002
2003                page_zero_new_buffers(page, start+copied, start+len);
2004        }
2005        flush_dcache_page(page);
2006
2007        /* This could be a short (even 0-length) commit */
2008        __block_commit_write(inode, page, start, start+copied);
2009
2010        return copied;
2011}
2012EXPORT_SYMBOL(block_write_end);
2013
2014int generic_write_end(struct file *file, struct address_space *mapping,
2015                        loff_t pos, unsigned len, unsigned copied,
2016                        struct page *page, void *fsdata)
2017{
2018        struct inode *inode = mapping->host;
2019        int i_size_changed = 0;
2020
2021        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2022
2023        /*
2024         * No need to use i_size_read() here, the i_size
2025         * cannot change under us because we hold i_mutex.
2026         *
2027         * But it's important to update i_size while still holding page lock:
2028         * page writeout could otherwise come in and zero beyond i_size.
2029         */
2030        if (pos+copied > inode->i_size) {
2031                i_size_write(inode, pos+copied);
2032                i_size_changed = 1;
2033        }
2034
2035        unlock_page(page);
2036        page_cache_release(page);
2037
2038        /*
2039         * Don't mark the inode dirty under page lock. First, it unnecessarily
2040         * makes the holding time of page lock longer. Second, it forces lock
2041         * ordering of page lock and transaction start for journaling
2042         * filesystems.
2043         */
2044        if (i_size_changed)
2045                mark_inode_dirty(inode);
2046
2047        return copied;
2048}
2049EXPORT_SYMBOL(generic_write_end);
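
/*
 * Illustrative sketch, not part of the original file: the ->write_end that
 * pairs with the examplefs_write_begin() sketch above is usually just a
 * call to generic_write_end(), which commits the copied data and updates
 * i_size.
 */
static int examplefs_write_end(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned copied,
                               struct page *page, void *fsdata)
{
        return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}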
2050
2051/*
2052 * block_is_partially_uptodate checks whether buffers within a page are
2053 * uptodate or not.
2054 *
2055 * Returns true if all buffers which correspond to a file portion
2056 * we want to read are uptodate.
2057 */
2058int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2059                                        unsigned long from)
2060{
2061        struct inode *inode = page->mapping->host;
2062        unsigned block_start, block_end, blocksize;
2063        unsigned to;
2064        struct buffer_head *bh, *head;
2065        int ret = 1;
2066
2067        if (!page_has_buffers(page))
2068                return 0;
2069
2070        blocksize = 1 << inode->i_blkbits;
2071        to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2072        to = from + to;
2073        if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2074                return 0;
2075
2076        head = page_buffers(page);
2077        bh = head;
2078        block_start = 0;
2079        do {
2080                block_end = block_start + blocksize;
2081                if (block_end > from && block_start < to) {
2082                        if (!buffer_uptodate(bh)) {
2083                                ret = 0;
2084                                break;
2085                        }
2086                        if (block_end >= to)
2087                                break;
2088                }
2089                block_start = block_end;
2090                bh = bh->b_this_page;
2091        } while (bh != head);
2092
2093        return ret;
2094}
2095EXPORT_SYMBOL(block_is_partially_uptodate);
2096
2097/*
2098 * Generic "read page" function for block devices that have the normal
2099 * get_block functionality; this covers most block-device-backed filesystems.
2100 * Reads the page asynchronously --- the unlock_buffer() and
2101 * set/clear_buffer_uptodate() functions propagate buffer state into the
2102 * page struct once IO has completed.
2103 */
2104int block_read_full_page(struct page *page, get_block_t *get_block)
2105{
2106        struct inode *inode = page->mapping->host;
2107        sector_t iblock, lblock;
2108        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2109        unsigned int blocksize;
2110        int nr, i;
2111        int fully_mapped = 1;
2112
2113        BUG_ON(!PageLocked(page));
2114        blocksize = 1 << inode->i_blkbits;
2115        if (!page_has_buffers(page))
2116                create_empty_buffers(page, blocksize, 0);
2117        head = page_buffers(page);
2118
2119        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2120        lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2121        bh = head;
2122        nr = 0;
2123        i = 0;
2124
2125        do {
2126                if (buffer_uptodate(bh))
2127                        continue;
2128
2129                if (!buffer_mapped(bh)) {
2130                        int err = 0;
2131
2132                        fully_mapped = 0;
2133                        if (iblock < lblock) {
2134                                WARN_ON(bh->b_size != blocksize);
2135                                err = get_block(inode, iblock, bh, 0);
2136                                if (err)
2137                                        SetPageError(page);
2138                        }
2139                        if (!buffer_mapped(bh)) {
2140                                zero_user(page, i * blocksize, blocksize);
2141                                if (!err)
2142                                        set_buffer_uptodate(bh);
2143                                continue;
2144                        }
2145                        /*
2146                         * get_block() might have updated the buffer
2147                         * synchronously
2148                         */
2149                        if (buffer_uptodate(bh))
2150                                continue;
2151                }
2152                arr[nr++] = bh;
2153        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2154
2155        if (fully_mapped)
2156                SetPageMappedToDisk(page);
2157
2158        if (!nr) {
2159                /*
2160                 * All buffers are uptodate - we can set the page uptodate
2161                 * as well. But not if get_block() returned an error.
2162                 */
2163                if (!PageError(page))
2164                        SetPageUptodate(page);
2165                unlock_page(page);
2166                return 0;
2167        }
2168
2169        /* Stage two: lock the buffers */
2170        for (i = 0; i < nr; i++) {
2171                bh = arr[i];
2172                lock_buffer(bh);
2173                mark_buffer_async_read(bh);
2174        }
2175
2176        /*
2177         * Stage 3: start the IO.  Check for uptodateness
2178         * inside the buffer lock in case another process reading
2179         * the underlying blockdev brought it uptodate (the sct fix).
2180         */
2181        for (i = 0; i < nr; i++) {
2182                bh = arr[i];
2183                if (buffer_uptodate(bh))
2184                        end_buffer_async_read(bh, 1);
2185                else
2186                        submit_bh(READ, bh);
2187        }
2188        return 0;
2189}
2190EXPORT_SYMBOL(block_read_full_page);
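
/*
 * Illustrative sketch, not part of the original file: a filesystem's
 * ->readpage is usually just block_read_full_page() with its own
 * get_block callback (here the hypothetical examplefs_get_block from the
 * earlier sketch).
 */
static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}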
2191
2192/* utility function for filesystems that need to do work on expanding
2193 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2194 * deal with the hole.  
2195 */
2196int generic_cont_expand_simple(struct inode *inode, loff_t size)
2197{
2198        struct address_space *mapping = inode->i_mapping;
2199        struct page *page;
2200        void *fsdata;
2201        int err;
2202
2203        err = inode_newsize_ok(inode, size);
2204        if (err)
2205                goto out;
2206
2207        err = pagecache_write_begin(NULL, mapping, size, 0,
2208                                AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2209                                &page, &fsdata);
2210        if (err)
2211                goto out;
2212
2213        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2214        BUG_ON(err > 0);
2215
2216out:
2217        return err;
2218}
2219EXPORT_SYMBOL(generic_cont_expand_simple);
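
/*
 * Illustrative sketch, not part of the original file: a filesystem that
 * cannot represent holes might call generic_cont_expand_simple() from its
 * ->setattr when the file is being grown ("examplefs_setattr" is a
 * hypothetical name).
 */
static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int err = 0;

        if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
                err = generic_cont_expand_simple(inode, attr->ia_size);
        /* ... a real ->setattr applies the remaining attributes here ... */
        return err;
}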
2220
2221static int cont_expand_zero(struct file *file, struct address_space *mapping,
2222                            loff_t pos, loff_t *bytes)
2223{
2224        struct inode *inode = mapping->host;
2225        unsigned blocksize = 1 << inode->i_blkbits;
2226        struct page *page;
2227        void *fsdata;
2228        pgoff_t index, curidx;
2229        loff_t curpos;
2230        unsigned zerofrom, offset, len;
2231        int err = 0;
2232
2233        index = pos >> PAGE_CACHE_SHIFT;
2234        offset = pos & ~PAGE_CACHE_MASK;
2235
2236        while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2237                zerofrom = curpos & ~PAGE_CACHE_MASK;
2238                if (zerofrom & (blocksize-1)) {
2239                        *bytes |= (blocksize-1);
2240                        (*bytes)++;
2241                }
2242                len = PAGE_CACHE_SIZE - zerofrom;
2243
2244                err = pagecache_write_begin(file, mapping, curpos, len,
2245                                                AOP_FLAG_UNINTERRUPTIBLE,
2246                                                &page, &fsdata);
2247                if (err)
2248                        goto out;
2249                zero_user(page, zerofrom, len);
2250                err = pagecache_write_end(file, mapping, curpos, len, len,
2251                                                page, fsdata);
2252                if (err < 0)
2253                        goto out;
2254                BUG_ON(err != len);
2255                err = 0;
2256
2257                balance_dirty_pages_ratelimited(mapping);
2258        }
2259
2260        /* page covers the boundary, find the boundary offset */
2261        if (index == curidx) {
2262                zerofrom = curpos & ~PAGE_CACHE_MASK;
2263                /* if we will expand the thing, the last block will be filled */
2264                if (offset <= zerofrom) {
2265                        goto out;
2266                }
2267                if (zerofrom & (blocksize-1)) {
2268                        *bytes |= (blocksize-1);
2269                        (*bytes)++;
2270                }
2271                len = offset - zerofrom;
2272
2273                err = pagecache_write_begin(file, mapping, curpos, len,
2274                                                AOP_FLAG_UNINTERRUPTIBLE,
2275                                                &page, &fsdata);
2276                if (err)
2277                        goto out;
2278                zero_user(page, zerofrom, len);
2279                err = pagecache_write_end(file, mapping, curpos, len, len,
2280                                                page, fsdata);
2281                if (err < 0)
2282                        goto out;
2283                BUG_ON(err != len);
2284                err = 0;
2285        }
2286out:
2287        return err;
2288}
2289
2290/*
2291 * For moronic filesystems that do not allow holes in files.
2292 * We may have to extend the file.
2293 */
2294int cont_write_begin(struct file *file, struct address_space *mapping,
2295                        loff_t pos, unsigned len, unsigned flags,
2296                        struct page **pagep, void **fsdata,
2297                        get_block_t *get_block, loff_t *bytes)
2298{
2299        struct inode *inode = mapping->host;
2300        unsigned blocksize = 1 << inode->i_blkbits;
2301        unsigned zerofrom;
2302        int err;
2303
2304        err = cont_expand_zero(file, mapping, pos, bytes);
2305        if (err)
2306                return err;
2307
2308        zerofrom = *bytes & ~PAGE_CACHE_MASK;
2309        if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2310                *bytes |= (blocksize-1);
2311                (*bytes)++;
2312        }
2313
2314        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2315}
2316EXPORT_SYMBOL(cont_write_begin);
2317
2318int block_commit_write(struct page *page, unsigned from, unsigned to)
2319{
2320        struct inode *inode = page->mapping->host;
2321        __block_commit_write(inode,page,from,to);
2322        return 0;
2323}
2324EXPORT_SYMBOL(block_commit_write);
2325
2326/*
2327 * block_page_mkwrite() is not allowed to change the file size as it gets
2328 * called from a page fault handler when a page is first dirtied. Hence we must
2329 * be careful to check for EOF conditions here. We set the page up correctly
2330 * for a written page which means we get ENOSPC checking when writing into
2331 * holes and correct delalloc and unwritten extent mapping on filesystems that
2332 * support these features.
2333 *
2334 * We are not allowed to take the i_mutex here so we have to play games to
2335 * protect against truncate races as the page could now be beyond EOF.  Because
2336 * truncate writes the inode size before removing pages, once we have the
2337 * page lock we can determine safely if the page is beyond EOF. If it is not
2338 * beyond EOF, then the page is guaranteed safe against truncation until we
2339 * unlock the page.
2340 *
2341 * Direct callers of this function should call vfs_check_frozen() so that the
2342 * page fault does not busyloop until the fs is thawed.
2343 */
2344int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2345                         get_block_t get_block)
2346{
2347        struct page *page = vmf->page;
2348        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2349        unsigned long end;
2350        loff_t size;
2351        int ret;
2352
2353        lock_page(page);
2354        size = i_size_read(inode);
2355        if ((page->mapping != inode->i_mapping) ||
2356            (page_offset(page) > size)) {
2357                /* We overload EFAULT to mean page got truncated */
2358                ret = -EFAULT;
2359                goto out_unlock;
2360        }
2361
2362        /* page is wholly or partially inside EOF */
2363        if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2364                end = size & ~PAGE_CACHE_MASK;
2365        else
2366                end = PAGE_CACHE_SIZE;
2367
2368        ret = __block_write_begin(page, 0, end, get_block);
2369        if (!ret)
2370                ret = block_commit_write(page, 0, end);
2371
2372        if (unlikely(ret < 0))
2373                goto out_unlock;
2374        /*
2375         * Freezing in progress? We check after the page is marked dirty and
2376         * with page lock held so if the test here fails, we are sure freezing
2377         * code will wait during syncing until the page fault is done - at that
2378         * point page will be dirty and unlocked so freezing code will write it
2379         * and writeprotect it again.
2380         */
2381        set_page_dirty(page);
2382        if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2383                ret = -EAGAIN;
2384                goto out_unlock;
2385        }
2386        wait_on_page_writeback(page);
2387        return 0;
2388out_unlock:
2389        unlock_page(page);
2390        return ret;
2391}
2392EXPORT_SYMBOL(__block_page_mkwrite);
2393
2394int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2395                   get_block_t get_block)
2396{
2397        int ret;
2398        struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2399
2400        /*
2401         * This check is racy but catches the common case. The check in
2402         * __block_page_mkwrite() is reliable.
2403         */
2404        vfs_check_frozen(sb, SB_FREEZE_WRITE);
2405        ret = __block_page_mkwrite(vma, vmf, get_block);
2406        return block_page_mkwrite_return(ret);
2407}
2408EXPORT_SYMBOL(block_page_mkwrite);
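
/*
 * Illustrative sketch, not part of the original file: a filesystem wires
 * block_page_mkwrite() into the vm_operations_struct it installs from
 * ->mmap, alongside filemap_fault() for the read side.  The "examplefs"
 * names are hypothetical and reuse the get_block sketched earlier.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return block_page_mkwrite(vma, vmf, examplefs_get_block);
}

static const struct vm_operations_struct examplefs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = examplefs_page_mkwrite,
};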
2409
2410/*
2411 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2412 * immediately, while under the page lock.  So it needs a special end_io
2413 * handler which does not touch the bh after unlocking it.
2414 */
2415static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2416{
2417        __end_buffer_read_notouch(bh, uptodate);
2418}
2419
2420/*
2421 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2422 * the page (converting it to circular linked list and taking care of page
2423 * dirty races).
2424 */
2425static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2426{
2427        struct buffer_head *bh;
2428
2429        BUG_ON(!PageLocked(page));
2430
2431        spin_lock(&page->mapping->private_lock);
2432        bh = head;
2433        do {
2434                if (PageDirty(page))
2435                        set_buffer_dirty(bh);
2436                if (!bh->b_this_page)
2437                        bh->b_this_page = head;
2438                bh = bh->b_this_page;
2439        } while (bh != head);
2440        attach_page_buffers(page, head);
2441        spin_unlock(&page->mapping->private_lock);
2442}
2443
2444/*
2445 * On entry, the page is fully not uptodate.
2446 * On exit the page is fully uptodate in the areas outside (from,to).
2447 * The filesystem needs to handle block truncation upon failure.
2448 */
2449int nobh_write_begin(struct address_space *mapping,
2450                        loff_t pos, unsigned len, unsigned flags,
2451                        struct page **pagep, void **fsdata,
2452                        get_block_t *get_block)
2453{
2454        struct inode *inode = mapping->host;
2455        const unsigned blkbits = inode->i_blkbits;
2456        const unsigned blocksize = 1 << blkbits;
2457        struct buffer_head *head, *bh;
2458        struct page *page;
2459        pgoff_t index;
2460        unsigned from, to;
2461        unsigned block_in_page;
2462        unsigned block_start, block_end;
2463        sector_t block_in_file;
2464        int nr_reads = 0;
2465        int ret = 0;
2466        int is_mapped_to_disk = 1;
2467
2468        index = pos >> PAGE_CACHE_SHIFT;
2469        from = pos & (PAGE_CACHE_SIZE - 1);
2470        to = from + len;
2471
2472        page = grab_cache_page_write_begin(mapping, index, flags);
2473        if (!page)
2474                return -ENOMEM;
2475        *pagep = page;
2476        *fsdata = NULL;
2477
2478        if (page_has_buffers(page)) {
2479                ret = __block_write_begin(page, pos, len, get_block);
2480                if (unlikely(ret))
2481                        goto out_release;
2482                return ret;
2483        }
2484
2485        if (PageMappedToDisk(page))
2486                return 0;
2487
2488        /*
2489         * Allocate buffers so that we can keep track of state, and potentially
2490         * attach them to the page if an error occurs. In the common case of
2491         * no error, they will just be freed again without ever being attached
2492         * to the page (which is all OK, because we're under the page lock).
2493         *
2494         * Be careful: the buffer linked list is a NULL terminated one, rather
2495         * than the circular one we're used to.
2496         */
2497        head = alloc_page_buffers(page, blocksize, 0);
2498        if (!head) {
2499                ret = -ENOMEM;
2500                goto out_release;
2501        }
2502
2503        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2504
2505        /*
2506         * We loop across all blocks in the page, whether or not they are
2507         * part of the affected region.  This is so we can discover if the
2508         * page is fully mapped-to-disk.
2509         */
2510        for (block_start = 0, block_in_page = 0, bh = head;
2511                  block_start < PAGE_CACHE_SIZE;
2512                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2513                int create;
2514
2515                block_end = block_start + blocksize;
2516                bh->b_state = 0;
2517                create = 1;
2518                if (block_start >= to)
2519                        create = 0;
2520                ret = get_block(inode, block_in_file + block_in_page,
2521                                        bh, create);
2522                if (ret)
2523                        goto failed;
2524                if (!buffer_mapped(bh))
2525                        is_mapped_to_disk = 0;
2526                if (buffer_new(bh))
2527                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2528                if (PageUptodate(page)) {
2529                        set_buffer_uptodate(bh);
2530                        continue;
2531                }
2532                if (buffer_new(bh) || !buffer_mapped(bh)) {
2533                        zero_user_segments(page, block_start, from,
2534                                                        to, block_end);
2535                        continue;
2536                }
2537                if (buffer_uptodate(bh))
2538                        continue;       /* reiserfs does this */
2539                if (block_start < from || block_end > to) {
2540                        lock_buffer(bh);
2541                        bh->b_end_io = end_buffer_read_nobh;
2542                        submit_bh(READ, bh);
2543                        nr_reads++;
2544                }
2545        }
2546
2547        if (nr_reads) {
2548                /*
2549                 * The page is locked, so these buffers are protected from
2550                 * any VM or truncate activity.  Hence we don't need to care
2551                 * for the buffer_head refcounts.
2552                 */
2553                for (bh = head; bh; bh = bh->b_this_page) {
2554                        wait_on_buffer(bh);
2555                        if (!buffer_uptodate(bh))
2556                                ret = -EIO;
2557                }
2558                if (ret)
2559                        goto failed;
2560        }
2561
2562        if (is_mapped_to_disk)
2563                SetPageMappedToDisk(page);
2564
2565        *fsdata = head; /* to be released by nobh_write_end */
2566
2567        return 0;
2568
2569failed:
2570        BUG_ON(!ret);
2571        /*
2572         * Error recovery is a bit difficult. We need to zero out blocks that
2573         * were newly allocated, and dirty them to ensure they get written out.
2574         * Buffers need to be attached to the page at this point, otherwise
2575         * the handling of potential IO errors during writeout would be hard
2576         * (could try doing synchronous writeout, but what if that fails too?)
2577         */
2578        attach_nobh_buffers(page, head);
2579        page_zero_new_buffers(page, from, to);
2580
2581out_release:
2582        unlock_page(page);
2583        page_cache_release(page);
2584        *pagep = NULL;
2585
2586        return ret;
2587}
2588EXPORT_SYMBOL(nobh_write_begin);
2589
2590int nobh_write_end(struct file *file, struct address_space *mapping,
2591                        loff_t pos, unsigned len, unsigned copied,
2592                        struct page *page, void *fsdata)
2593{
2594        struct inode *inode = page->mapping->host;
2595        struct buffer_head *head = fsdata;
2596        struct buffer_head *bh;
2597        BUG_ON(fsdata != NULL && page_has_buffers(page));
2598
2599        if (unlikely(copied < len) && head)
2600                attach_nobh_buffers(page, head);
2601        if (page_has_buffers(page))
2602                return generic_write_end(file, mapping, pos, len,
2603                                        copied, page, fsdata);
2604
2605        SetPageUptodate(page);
2606        set_page_dirty(page);
2607        if (pos+copied > inode->i_size) {
2608                i_size_write(inode, pos+copied);
2609                mark_inode_dirty(inode);
2610        }
2611
2612        unlock_page(page);
2613        page_cache_release(page);
2614
2615        while (head) {
2616                bh = head;
2617                head = head->b_this_page;
2618                free_buffer_head(bh);
2619        }
2620
2621        return copied;
2622}
2623EXPORT_SYMBOL(nobh_write_end);
2624
2625/*
2626 * nobh_writepage() - based on block_write_full_page() except
2627 * that it tries to operate without attaching bufferheads to
2628 * the page.
2629 */
2630int nobh_writepage(struct page *page, get_block_t *get_block,
2631                        struct writeback_control *wbc)
2632{
2633        struct inode * const inode = page->mapping->host;
2634        loff_t i_size = i_size_read(inode);
2635        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2636        unsigned offset;
2637        int ret;
2638
2639        /* Is the page fully inside i_size? */
2640        if (page->index < end_index)
2641                goto out;
2642
2643        /* Is the page fully outside i_size? (truncate in progress) */
2644        offset = i_size & (PAGE_CACHE_SIZE-1);
2645        if (page->index >= end_index+1 || !offset) {
2646                /*
2647                 * The page may have dirty, unmapped buffers.  For example,
2648                 * they may have been added in ext3_writepage().  Make them
2649                 * freeable here, so the page does not leak.
2650                 */
2651#if 0
2652                /* Not really sure about this  - do we need this ? */
2653                if (page->mapping->a_ops->invalidatepage)
2654                        page->mapping->a_ops->invalidatepage(page, offset);
2655#endif
2656                unlock_page(page);
2657                return 0; /* don't care */
2658        }
2659
2660        /*
2661         * The page straddles i_size.  It must be zeroed out on each and every
2662         * writepage invocation because it may be mmapped.  "A file is mapped
2663         * in multiples of the page size.  For a file that is not a multiple of
2664         * the  page size, the remaining memory is zeroed when mapped, and
2665         * writes to that region are not written out to the file."
2666         */
2667        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2668out:
2669        ret = mpage_writepage(page, get_block, wbc);
2670        if (ret == -EAGAIN)
2671                ret = __block_write_full_page(inode, page, get_block, wbc,
2672                                              end_buffer_async_write);
2673        return ret;
2674}
2675EXPORT_SYMBOL(nobh_writepage);
2676
2677int nobh_truncate_page(struct address_space *mapping,
2678                        loff_t from, get_block_t *get_block)
2679{
2680        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2681        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2682        unsigned blocksize;
2683        sector_t iblock;
2684        unsigned length, pos;
2685        struct inode *inode = mapping->host;
2686        struct page *page;
2687        struct buffer_head map_bh;
2688        int err;
2689
2690        blocksize = 1 << inode->i_blkbits;
2691        length = offset & (blocksize - 1);
2692
2693        /* Block boundary? Nothing to do */
2694        if (!length)
2695                return 0;
2696
2697        length = blocksize - length;
2698        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2699
2700        page = grab_cache_page(mapping, index);
2701        err = -ENOMEM;
2702        if (!page)
2703                goto out;
2704
2705        if (page_has_buffers(page)) {
2706has_buffers:
2707                unlock_page(page);
2708                page_cache_release(page);
2709                return block_truncate_page(mapping, from, get_block);
2710        }
2711
2712        /* Find the buffer that contains "offset" */
2713        pos = blocksize;
2714        while (offset >= pos) {
2715                iblock++;
2716                pos += blocksize;
2717        }
2718
2719        map_bh.b_size = blocksize;
2720        map_bh.b_state = 0;
2721        err = get_block(inode, iblock, &map_bh, 0);
2722        if (err)
2723                goto unlock;
2724        /* unmapped? It's a hole - nothing to do */
2725        if (!buffer_mapped(&map_bh))
2726                goto unlock;
2727
2728        /* Ok, it's mapped. Make sure it's up-to-date */
2729        if (!PageUptodate(page)) {
2730                err = mapping->a_ops->readpage(NULL, page);
2731                if (err) {
2732                        page_cache_release(page);
2733                        goto out;
2734                }
2735                lock_page(page);
2736                if (!PageUptodate(page)) {
2737                        err = -EIO;
2738                        goto unlock;
2739                }
2740                if (page_has_buffers(page))
2741                        goto has_buffers;
2742        }
2743        zero_user(page, offset, length);
2744        set_page_dirty(page);
2745        err = 0;
2746
2747unlock:
2748        unlock_page(page);
2749        page_cache_release(page);
2750out:
2751        return err;
2752}
2753EXPORT_SYMBOL(nobh_truncate_page);
2754
2755int block_truncate_page(struct address_space *mapping,
2756                        loff_t from, get_block_t *get_block)
2757{
2758        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2759        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2760        unsigned blocksize;
2761        sector_t iblock;
2762        unsigned length, pos;
2763        struct inode *inode = mapping->host;
2764        struct page *page;
2765        struct buffer_head *bh;
2766        int err;
2767
2768        blocksize = 1 << inode->i_blkbits;
2769        length = offset & (blocksize - 1);
2770
2771        /* Block boundary? Nothing to do */
2772        if (!length)
2773                return 0;
2774
2775        length = blocksize - length;
2776        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2777        
2778        page = grab_cache_page(mapping, index);
2779        err = -ENOMEM;
2780        if (!page)
2781                goto out;
2782
2783        if (!page_has_buffers(page))
2784                create_empty_buffers(page, blocksize, 0);
2785
2786        /* Find the buffer that contains "offset" */
2787        bh = page_buffers(page);
2788        pos = blocksize;
2789        while (offset >= pos) {
2790                bh = bh->b_this_page;
2791                iblock++;
2792                pos += blocksize;
2793        }
2794
2795        err = 0;
2796        if (!buffer_mapped(bh)) {
2797                WARN_ON(bh->b_size != blocksize);
2798                err = get_block(inode, iblock, bh, 0);
2799                if (err)
2800                        goto unlock;
2801                /* unmapped? It's a hole - nothing to do */
2802                if (!buffer_mapped(bh))
2803                        goto unlock;
2804        }
2805
2806        /* Ok, it's mapped. Make sure it's up-to-date */
2807        if (PageUptodate(page))
2808                set_buffer_uptodate(bh);
2809
2810        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2811                err = -EIO;
2812                ll_rw_block(READ, 1, &bh);
2813                wait_on_buffer(bh);
2814                /* Uhhuh. Read error. Complain and punt. */
2815                if (!buffer_uptodate(bh))
2816                        goto unlock;
2817        }
2818
2819        zero_user(page, offset, length);
2820        mark_buffer_dirty(bh);
2821        err = 0;
2822
2823unlock:
2824        unlock_page(page);
2825        page_cache_release(page);
2826out:
2827        return err;
2828}
2829EXPORT_SYMBOL(block_truncate_page);
2830
2831/*
2832 * The generic ->writepage function for buffer-backed address_spaces
2833 * this form passes in the end_io handler used to finish the IO.
2834 */
2835int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2836                        struct writeback_control *wbc, bh_end_io_t *handler)
2837{
2838        struct inode * const inode = page->mapping->host;
2839        loff_t i_size = i_size_read(inode);
2840        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2841        unsigned offset;
2842
2843        /* Is the page fully inside i_size? */
2844        if (page->index < end_index)
2845                return __block_write_full_page(inode, page, get_block, wbc,
2846                                               handler);
2847
2848        /* Is the page fully outside i_size? (truncate in progress) */
2849        offset = i_size & (PAGE_CACHE_SIZE-1);
2850        if (page->index >= end_index+1 || !offset) {
2851                /*
2852                 * The page may have dirty, unmapped buffers.  For example,
2853                 * they may have been added in ext3_writepage().  Make them
2854                 * freeable here, so the page does not leak.
2855                 */
2856                do_invalidatepage(page, 0);
2857                unlock_page(page);
2858                return 0; /* don't care */
2859        }
2860
2861        /*
2862         * The page straddles i_size.  It must be zeroed out on each and every
2863         * writepage invocation because it may be mmapped.  "A file is mapped
2864         * in multiples of the page size.  For a file that is not a multiple of
2865         * the  page size, the remaining memory is zeroed when mapped, and
2866         * writes to that region are not written out to the file."
2867         */
2868        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2869        return __block_write_full_page(inode, page, get_block, wbc, handler);
2870}
2871EXPORT_SYMBOL(block_write_full_page_endio);
2872
2873/*
2874 * The generic ->writepage function for buffer-backed address_spaces
2875 */
2876int block_write_full_page(struct page *page, get_block_t *get_block,
2877                        struct writeback_control *wbc)
2878{
2879        return block_write_full_page_endio(page, get_block, wbc,
2880                                           end_buffer_async_write);
2881}
2882EXPORT_SYMBOL(block_write_full_page);
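
/*
 * Illustrative sketch, not part of the original file: the matching
 * ->writepage wrapper, again using the hypothetical examplefs_get_block
 * from the earlier sketch.
 */
static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}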
2883
2884sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2885                            get_block_t *get_block)
2886{
2887        struct buffer_head tmp;
2888        struct inode *inode = mapping->host;
2889        tmp.b_state = 0;
2890        tmp.b_blocknr = 0;
2891        tmp.b_size = 1 << inode->i_blkbits;
2892        get_block(inode, block, &tmp, 0);
2893        return tmp.b_blocknr;
2894}
2895EXPORT_SYMBOL(generic_block_bmap);
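
/*
 * Illustrative sketch, not part of the original file: ->bmap (the FIBMAP
 * ioctl path) is typically generic_block_bmap() with the filesystem's
 * get_block, and the wrappers sketched throughout this file would be
 * collected into an address_space_operations like the one below.  A real
 * filesystem usually sets further methods (e.g. ->invalidatepage).
 */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, examplefs_get_block);
}

static const struct address_space_operations examplefs_aops = {
        .readpage       = examplefs_readpage,
        .writepage      = examplefs_writepage,
        .write_begin    = examplefs_write_begin,
        .write_end      = examplefs_write_end,
        .bmap           = examplefs_bmap,
};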
2896
2897static void end_bio_bh_io_sync(struct bio *bio, int err)
2898{
2899        struct buffer_head *bh = bio->bi_private;
2900
2901        if (err == -EOPNOTSUPP) {
2902                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2903        }
2904
2905        if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2906                set_bit(BH_Quiet, &bh->b_state);
2907
2908        bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2909        bio_put(bio);
2910}
2911
2912int submit_bh(int rw, struct buffer_head * bh)
2913{
2914        struct bio *bio;
2915        int ret = 0;
2916
2917        BUG_ON(!buffer_locked(bh));
2918        BUG_ON(!buffer_mapped(bh));
2919        BUG_ON(!bh->b_end_io);
2920        BUG_ON(buffer_delay(bh));
2921        BUG_ON(buffer_unwritten(bh));
2922
2923        /*
2924         * Only clear out a write error when rewriting
2925         */
2926        if (test_set_buffer_req(bh) && (rw & WRITE))
2927                clear_buffer_write_io_error(bh);
2928
2929        /*
2930         * from here on down, it's all bio -- do the initial mapping,
2931         * submit_bio -> generic_make_request may further map this bio around
2932         */
2933        bio = bio_alloc(GFP_NOIO, 1);
2934
2935        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2936        bio->bi_bdev = bh->b_bdev;
2937        bio->bi_io_vec[0].bv_page = bh->b_page;
2938        bio->bi_io_vec[0].bv_len = bh->b_size;
2939        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2940
2941        bio->bi_vcnt = 1;
2942        bio->bi_idx = 0;
2943        bio->bi_size = bh->b_size;
2944
2945        bio->bi_end_io = end_bio_bh_io_sync;
2946        bio->bi_private = bh;
2947
2948        bio_get(bio);
2949        submit_bio(rw, bio);
2950
2951        if (bio_flagged(bio, BIO_EOPNOTSUPP))
2952                ret = -EOPNOTSUPP;
2953
2954        bio_put(bio);
2955        return ret;
2956}
2957EXPORT_SYMBOL(submit_bh);
2958
2959/**
2960 * ll_rw_block: low-level access to block devices (DEPRECATED)
2961 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2962 * @nr: number of &struct buffer_heads in the array
2963 * @bhs: array of pointers to &struct buffer_head
2964 *
2965 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2966 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2967 * %READA option is described in the documentation for generic_make_request()
2968 * which ll_rw_block() calls.
2969 *
2970 * This function drops any buffer that it cannot get a lock on (with the
2971 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2972 * request, and any buffer that appears to be up-to-date when doing a read
2973 * request.  Further it marks as clean buffers that are processed for
2974 * writing (the buffer cache won't assume that they are actually clean
2975 * until the buffer gets unlocked).
2976 *
2977 * ll_rw_block sets b_end_io to a simple completion handler that marks
2978 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2979 * any waiters.
2980 *
2981 * All of the buffers must be for the same device, and must also be a
2982 * multiple of the current approved size for the device.
2983 */
2984void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2985{
2986        int i;
2987
2988        for (i = 0; i < nr; i++) {
2989                struct buffer_head *bh = bhs[i];
2990
2991                if (!trylock_buffer(bh))
2992                        continue;
2993                if (rw == WRITE) {
2994                        if (test_clear_buffer_dirty(bh)) {
2995                                bh->b_end_io = end_buffer_write_sync;
2996                                get_bh(bh);
2997                                submit_bh(WRITE, bh);
2998                                continue;
2999                        }
3000                } else {
3001                        if (!buffer_uptodate(bh)) {
3002                                bh->b_end_io = end_buffer_read_sync;
3003                                get_bh(bh);
3004                                submit_bh(rw, bh);
3005                                continue;
3006                        }
3007                }
3008                unlock_buffer(bh);
3009        }
3010}
3011EXPORT_SYMBOL(ll_rw_block);
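
/*
 * A hedged sketch of the usual ll_rw_block() calling pattern for reading
 * several metadata buffers.  Because buffers that are locked or already
 * up to date are silently skipped, the caller must still wait on and
 * re-check every buffer; myfs_read_buffers is a hypothetical name.
 */
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
        int i, err = 0;

        ll_rw_block(READ, nr, bhs);
        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);
                if (!buffer_uptodate(bhs[i]))
                        err = -EIO;
        }
        return err;
}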
3012
3013void write_dirty_buffer(struct buffer_head *bh, int rw)
3014{
3015        lock_buffer(bh);
3016        if (!test_clear_buffer_dirty(bh)) {
3017                unlock_buffer(bh);
3018                return;
3019        }
3020        bh->b_end_io = end_buffer_write_sync;
3021        get_bh(bh);
3022        submit_bh(rw, bh);
3023}
3024EXPORT_SYMBOL(write_dirty_buffer);
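
/*
 * Illustrative sketch: write_dirty_buffer() starts the write and returns
 * without waiting, so it suits opportunistic flushing of dirty metadata;
 * pair it with wait_on_buffer() when completion matters.  myfs_flush_bh
 * is a hypothetical name.
 */
static void myfs_flush_bh(struct buffer_head *bh)
{
        mark_buffer_dirty(bh);
        write_dirty_buffer(bh, WRITE);  /* no wait; buffer is unlocked at end_io */
}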
3025
3026/*
3027 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3028 * and then start new I/O and then wait upon it.  The caller must have a ref on
3029 * the buffer_head.
3030 */
3031int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3032{
3033        int ret = 0;
3034
3035        WARN_ON(atomic_read(&bh->b_count) < 1);
3036        lock_buffer(bh);
3037        if (test_clear_buffer_dirty(bh)) {
3038                get_bh(bh);
3039                bh->b_end_io = end_buffer_write_sync;
3040                ret = submit_bh(rw, bh);
3041                wait_on_buffer(bh);
3042                if (!ret && !buffer_uptodate(bh))
3043                        ret = -EIO;
3044        } else {
3045                unlock_buffer(bh);
3046        }
3047        return ret;
3048}
3049EXPORT_SYMBOL(__sync_dirty_buffer);
3050
3051int sync_dirty_buffer(struct buffer_head *bh)
3052{
3053        return __sync_dirty_buffer(bh, WRITE_SYNC);
3054}
3055EXPORT_SYMBOL(sync_dirty_buffer);
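
/*
 * A minimal sketch of the common sync_dirty_buffer() pattern for flushing
 * an on-disk superblock or similar metadata.  myfs_sync_super is a
 * hypothetical name; sbh is assumed to hold the superblock buffer.
 */
static int myfs_sync_super(struct buffer_head *sbh)
{
        mark_buffer_dirty(sbh);
        return sync_dirty_buffer(sbh);  /* 0 on success, negative errno on failure */
}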
3056
3057/*
3058 * try_to_free_buffers() checks if all the buffers on this particular page
3059 * are unused, and releases them if so.
3060 *
3061 * Exclusion against try_to_free_buffers may be obtained by either
3062 * locking the page or by holding its mapping's private_lock.
3063 *
3064 * If the page is dirty but all the buffers are clean then we need to
3065 * be sure to mark the page clean as well.  This is because the page
3066 * may be against a block device, and a later reattachment of buffers
3067 * to a dirty page will set *all* buffers dirty, which would corrupt
3068 * filesystem data on the same device.
3069 *
3070 * The same applies to regular filesystem pages: if all the buffers are
3071 * clean then we set the page clean and proceed.  To do that, we require
3072 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3073 * private_lock.
3074 *
3075 * try_to_free_buffers() is non-blocking.
3076 */
3077static inline int buffer_busy(struct buffer_head *bh)
3078{
3079        return atomic_read(&bh->b_count) |
3080                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3081}
3082
3083static int
3084drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3085{
3086        struct buffer_head *head = page_buffers(page);
3087        struct buffer_head *bh;
3088
3089        bh = head;
3090        do {
3091                if (buffer_write_io_error(bh) && page->mapping)
3092                        set_bit(AS_EIO, &page->mapping->flags);
3093                if (buffer_busy(bh))
3094                        goto failed;
3095                bh = bh->b_this_page;
3096        } while (bh != head);
3097
3098        do {
3099                struct buffer_head *next = bh->b_this_page;
3100
3101                if (bh->b_assoc_map)
3102                        __remove_assoc_queue(bh);
3103                bh = next;
3104        } while (bh != head);
3105        *buffers_to_free = head;
3106        __clear_page_buffers(page);
3107        return 1;
3108failed:
3109        return 0;
3110}
3111
3112int try_to_free_buffers(struct page *page)
3113{
3114        struct address_space * const mapping = page->mapping;
3115        struct buffer_head *buffers_to_free = NULL;
3116        int ret = 0;
3117
3118        BUG_ON(!PageLocked(page));
3119        if (PageWriteback(page))
3120                return 0;
3121
3122        if (mapping == NULL) {          /* can this still happen? */
3123                ret = drop_buffers(page, &buffers_to_free);
3124                goto out;
3125        }
3126
3127        spin_lock(&mapping->private_lock);
3128        ret = drop_buffers(page, &buffers_to_free);
3129
3130        /*
3131         * If the filesystem writes its buffers by hand (eg ext3)
3132         * then we can have clean buffers against a dirty page.  We
3133         * clean the page here; otherwise the VM will never notice
3134         * that the filesystem did any IO at all.
3135         *
3136         * Also, during truncate, discard_buffer will have marked all
3137         * the page's buffers clean.  We discover that here and clean
3138         * the page also.
3139         *
3140         * private_lock must be held over this entire operation in order
3141         * to synchronise against __set_page_dirty_buffers and prevent the
3142         * dirty bit from being lost.
3143         */
3144        if (ret)
3145                cancel_dirty_page(page, PAGE_CACHE_SIZE);
3146        spin_unlock(&mapping->private_lock);
3147out:
3148        if (buffers_to_free) {
3149                struct buffer_head *bh = buffers_to_free;
3150
3151                do {
3152                        struct buffer_head *next = bh->b_this_page;
3153                        free_buffer_head(bh);
3154                        bh = next;
3155                } while (bh != buffers_to_free);
3156        }
3157        return ret;
3158}
3159EXPORT_SYMBOL(try_to_free_buffers);
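
/*
 * A hedged sketch of how try_to_free_buffers() is reached from a
 * filesystem.  When no ->releasepage is provided, try_to_release_page()
 * falls back to this function directly; a filesystem that does supply one
 * usually ends up calling it after its own checks.  myfs_releasepage is a
 * hypothetical name.
 */
static int myfs_releasepage(struct page *page, gfp_t gfp)
{
        /* a real implementation would first drop any fs-private references */
        return try_to_free_buffers(page);
}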
3160
3161/*
3162 * There are no bdflush tunables left.  But distributions are
3163 * still running obsolete flush daemons, so we terminate them here.
3164 *
3165 * Use of bdflush() is deprecated and will be removed in a future kernel.
3166 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3167 */
3168SYSCALL_DEFINE2(bdflush, int, func, long, data)
3169{
3170        static int msg_count;
3171
3172        if (!capable(CAP_SYS_ADMIN))
3173                return -EPERM;
3174
3175        if (msg_count < 5) {
3176                msg_count++;
3177                printk(KERN_INFO
3178                        "warning: process `%s' used the obsolete bdflush"
3179                        " system call\n", current->comm);
3180                printk(KERN_INFO "Fix your initscripts?\n");
3181        }
3182
3183        if (func == 1)
3184                do_exit(0);
3185        return 0;
3186}
3187
3188/*
3189 * Buffer-head allocation
3190 */
3191static struct kmem_cache *bh_cachep;
3192
3193/*
3194 * Once the number of bh's in the machine exceeds this level, we start
3195 * stripping them in writeback.
3196 */
3197static int max_buffer_heads;
3198
3199int buffer_heads_over_limit;
3200
3201struct bh_accounting {
3202        int nr;                 /* Number of live bh's */
3203        int ratelimit;          /* Limit cacheline bouncing */
3204};
3205
3206static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3207
3208static void recalc_bh_state(void)
3209{
3210        int i;
3211        int tot = 0;
3212
3213        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3214                return;
3215        __this_cpu_write(bh_accounting.ratelimit, 0);
3216        for_each_online_cpu(i)
3217                tot += per_cpu(bh_accounting, i).nr;
3218        buffer_heads_over_limit = (tot > max_buffer_heads);
3219}
3220
3221struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3222{
3223        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3224        if (ret) {
3225                INIT_LIST_HEAD(&ret->b_assoc_buffers);
3226                preempt_disable();
3227                __this_cpu_inc(bh_accounting.nr);
3228                recalc_bh_state();
3229                preempt_enable();
3230        }
3231        return ret;
3232}
3233EXPORT_SYMBOL(alloc_buffer_head);
3234
3235void free_buffer_head(struct buffer_head *bh)
3236{
3237        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3238        kmem_cache_free(bh_cachep, bh);
3239        preempt_disable();
3240        __this_cpu_dec(bh_accounting.nr);
3241        recalc_bh_state();
3242        preempt_enable();
3243}
3244EXPORT_SYMBOL(free_buffer_head);
3245
3246static void buffer_exit_cpu(int cpu)
3247{
3248        int i;
3249        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3250
3251        for (i = 0; i < BH_LRU_SIZE; i++) {
3252                brelse(b->bhs[i]);
3253                b->bhs[i] = NULL;
3254        }
3255        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3256        per_cpu(bh_accounting, cpu).nr = 0;
3257}
3258
3259static int buffer_cpu_notify(struct notifier_block *self,
3260                              unsigned long action, void *hcpu)
3261{
3262        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3263                buffer_exit_cpu((unsigned long)hcpu);
3264        return NOTIFY_OK;
3265}
3266
3267/**
3268 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3269 * @bh: struct buffer_head
3270 *
3271 * Returns 1 if the buffer is up-to-date; otherwise returns 0
3272 * with the buffer locked.
3273 */
3274int bh_uptodate_or_lock(struct buffer_head *bh)
3275{
3276        if (!buffer_uptodate(bh)) {
3277                lock_buffer(bh);
3278                if (!buffer_uptodate(bh))
3279                        return 0;
3280                unlock_buffer(bh);
3281        }
3282        return 1;
3283}
3284EXPORT_SYMBOL(bh_uptodate_or_lock);
3285
3286/**
3287 * bh_submit_read - Submit a locked buffer for reading
3288 * @bh: struct buffer_head
3289 *
3290 * Returns zero on success and -EIO on error.
3291 */
3292int bh_submit_read(struct buffer_head *bh)
3293{
3294        BUG_ON(!buffer_locked(bh));
3295
3296        if (buffer_uptodate(bh)) {
3297                unlock_buffer(bh);
3298                return 0;
3299        }
3300
3301        get_bh(bh);
3302        bh->b_end_io = end_buffer_read_sync;
3303        submit_bh(READ, bh);
3304        wait_on_buffer(bh);
3305        if (buffer_uptodate(bh))
3306                return 0;
3307        return -EIO;
3308}
3309EXPORT_SYMBOL(bh_submit_read);
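
/*
 * Illustrative only: bh_uptodate_or_lock() and bh_submit_read() are meant
 * to be used together, as in the hypothetical helper below.  The fast path
 * avoids locking, and on the slow path the buffer is already locked, as
 * bh_submit_read() requires.
 */
static int myfs_read_bh(struct buffer_head *bh)
{
        if (bh_uptodate_or_lock(bh))
                return 0;               /* already up to date */
        return bh_submit_read(bh);      /* bh is locked here; returns 0 or -EIO */
}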
3310
3311void __init buffer_init(void)
3312{
3313        int nrpages;
3314
3315        bh_cachep = kmem_cache_create("buffer_head",
3316                        sizeof(struct buffer_head), 0,
3317                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3318                                SLAB_MEM_SPREAD),
3319                                NULL);
3320
3321        /*
3322         * Limit the bh occupancy to 10% of ZONE_NORMAL
3323         */
3324        nrpages = (nr_free_buffer_pages() * 10) / 100;
3325        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3326        hotcpu_notifier(buffer_cpu_notify, 0);
3327}
3328