linux/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/export.h>
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <linux/bitops.h>
  42#include <linux/mpage.h>
  43#include <linux/bit_spinlock.h>
  44#include <trace/events/block.h>
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  47
  48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  51{
  52        bh->b_end_io = handler;
  53        bh->b_private = private;
  54}
  55EXPORT_SYMBOL(init_buffer);
  56
  57inline void touch_buffer(struct buffer_head *bh)
  58{
  59        trace_block_touch_buffer(bh);
  60        mark_page_accessed(bh->b_page);
  61}
  62EXPORT_SYMBOL(touch_buffer);
  63
  64void __lock_buffer(struct buffer_head *bh)
  65{
  66        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  67}
  68EXPORT_SYMBOL(__lock_buffer);
  69
  70void unlock_buffer(struct buffer_head *bh)
  71{
  72        clear_bit_unlock(BH_Lock, &bh->b_state);
  73        smp_mb__after_atomic();
  74        wake_up_bit(&bh->b_state, BH_Lock);
  75}
  76EXPORT_SYMBOL(unlock_buffer);
  77
  78/*
  79 * Returns if the page has dirty or writeback buffers. If all the buffers
  80 * are unlocked and clean then the PageDirty information is stale. If
  81 * any of the pages are locked, it is assumed they are locked for IO.
  82 */
  83void buffer_check_dirty_writeback(struct page *page,
  84                                     bool *dirty, bool *writeback)
  85{
  86        struct buffer_head *head, *bh;
  87        *dirty = false;
  88        *writeback = false;
  89
  90        BUG_ON(!PageLocked(page));
  91
  92        if (!page_has_buffers(page))
  93                return;
  94
  95        if (PageWriteback(page))
  96                *writeback = true;
  97
  98        head = page_buffers(page);
  99        bh = head;
 100        do {
 101                if (buffer_locked(bh))
 102                        *writeback = true;
 103
 104                if (buffer_dirty(bh))
 105                        *dirty = true;
 106
 107                bh = bh->b_this_page;
 108        } while (bh != head);
 109}
 110EXPORT_SYMBOL(buffer_check_dirty_writeback);
 111
 112/*
 113 * Block until a buffer comes unlocked.  This doesn't stop it
 114 * from becoming locked again - you have to lock it yourself
 115 * if you want to preserve its state.
 116 */
 117void __wait_on_buffer(struct buffer_head * bh)
 118{
 119        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 120}
 121EXPORT_SYMBOL(__wait_on_buffer);
 122
 123static void
 124__clear_page_buffers(struct page *page)
 125{
 126        ClearPagePrivate(page);
 127        set_page_private(page, 0);
 128        page_cache_release(page);
 129}
 130
 131static void buffer_io_error(struct buffer_head *bh, char *msg)
 132{
 133        char b[BDEVNAME_SIZE];
 134
 135        if (!test_bit(BH_Quiet, &bh->b_state))
 136                printk_ratelimited(KERN_ERR
 137                        "Buffer I/O error on dev %s, logical block %llu%s\n",
 138                        bdevname(bh->b_bdev, b),
 139                        (unsigned long long)bh->b_blocknr, msg);
 140}
 141
 142/*
 143 * End-of-IO handler helper function which does not touch the bh after
 144 * unlocking it.
 145 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 146 * a race there is benign: unlock_buffer() only use the bh's address for
 147 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 148 * itself.
 149 */
 150static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 151{
 152        if (uptodate) {
 153                set_buffer_uptodate(bh);
 154        } else {
 155                /* This happens, due to failed READA attempts. */
 156                clear_buffer_uptodate(bh);
 157        }
 158        unlock_buffer(bh);
 159}
 160
 161/*
 162 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 163 * unlock the buffer. This is what ll_rw_block uses too.
 164 */
 165void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 166{
 167        __end_buffer_read_notouch(bh, uptodate);
 168        put_bh(bh);
 169}
 170EXPORT_SYMBOL(end_buffer_read_sync);
 171
 172void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 173{
 174        if (uptodate) {
 175                set_buffer_uptodate(bh);
 176        } else {
 177                buffer_io_error(bh, ", lost sync page write");
 178                set_buffer_write_io_error(bh);
 179                clear_buffer_uptodate(bh);
 180        }
 181        unlock_buffer(bh);
 182        put_bh(bh);
 183}
 184EXPORT_SYMBOL(end_buffer_write_sync);
 185
 186/*
 187 * Various filesystems appear to want __find_get_block to be non-blocking.
 188 * But it's the page lock which protects the buffers.  To get around this,
 189 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 190 * private_lock.
 191 *
 192 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 193 * may be quite high.  This code could TryLock the page, and if that
 194 * succeeds, there is no need to take private_lock. (But if
 195 * private_lock is contended then so is mapping->tree_lock).
 196 */
 197static struct buffer_head *
 198__find_get_block_slow(struct block_device *bdev, sector_t block)
 199{
 200        struct inode *bd_inode = bdev->bd_inode;
 201        struct address_space *bd_mapping = bd_inode->i_mapping;
 202        struct buffer_head *ret = NULL;
 203        pgoff_t index;
 204        struct buffer_head *bh;
 205        struct buffer_head *head;
 206        struct page *page;
 207        int all_mapped = 1;
 208
 209        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 210        page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 211        if (!page)
 212                goto out;
 213
 214        spin_lock(&bd_mapping->private_lock);
 215        if (!page_has_buffers(page))
 216                goto out_unlock;
 217        head = page_buffers(page);
 218        bh = head;
 219        do {
 220                if (!buffer_mapped(bh))
 221                        all_mapped = 0;
 222                else if (bh->b_blocknr == block) {
 223                        ret = bh;
 224                        get_bh(bh);
 225                        goto out_unlock;
 226                }
 227                bh = bh->b_this_page;
 228        } while (bh != head);
 229
 230        /* we might be here because some of the buffers on this page are
 231         * not mapped.  This is due to various races between
 232         * file io on the block device and getblk.  It gets dealt with
 233         * elsewhere, don't buffer_error if we had some unmapped buffers
 234         */
 235        if (all_mapped) {
 236                char b[BDEVNAME_SIZE];
 237
 238                printk("__find_get_block_slow() failed. "
 239                        "block=%llu, b_blocknr=%llu\n",
 240                        (unsigned long long)block,
 241                        (unsigned long long)bh->b_blocknr);
 242                printk("b_state=0x%08lx, b_size=%zu\n",
 243                        bh->b_state, bh->b_size);
 244                printk("device %s blocksize: %d\n", bdevname(bdev, b),
 245                        1 << bd_inode->i_blkbits);
 246        }
 247out_unlock:
 248        spin_unlock(&bd_mapping->private_lock);
 249        page_cache_release(page);
 250out:
 251        return ret;
 252}
 253
 254/*
 255 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 256 */
 257static void free_more_memory(void)
 258{
 259        struct zone *zone;
 260        int nid;
 261
 262        wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 263        yield();
 264
 265        for_each_online_node(nid) {
 266                (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 267                                                gfp_zone(GFP_NOFS), NULL,
 268                                                &zone);
 269                if (zone)
 270                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 271                                                GFP_NOFS, NULL);
 272        }
 273}
 274
 275/*
 276 * I/O completion handler for block_read_full_page() - pages
 277 * which come unlocked at the end of I/O.
 278 */
 279static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 280{
 281        unsigned long flags;
 282        struct buffer_head *first;
 283        struct buffer_head *tmp;
 284        struct page *page;
 285        int page_uptodate = 1;
 286
 287        BUG_ON(!buffer_async_read(bh));
 288
 289        page = bh->b_page;
 290        if (uptodate) {
 291                set_buffer_uptodate(bh);
 292        } else {
 293                clear_buffer_uptodate(bh);
 294                buffer_io_error(bh, ", async page read");
 295                SetPageError(page);
 296        }
 297
 298        /*
 299         * Be _very_ careful from here on. Bad things can happen if
 300         * two buffer heads end IO at almost the same time and both
 301         * decide that the page is now completely done.
 302         */
 303        first = page_buffers(page);
 304        local_irq_save(flags);
 305        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 306        clear_buffer_async_read(bh);
 307        unlock_buffer(bh);
 308        tmp = bh;
 309        do {
 310                if (!buffer_uptodate(tmp))
 311                        page_uptodate = 0;
 312                if (buffer_async_read(tmp)) {
 313                        BUG_ON(!buffer_locked(tmp));
 314                        goto still_busy;
 315                }
 316                tmp = tmp->b_this_page;
 317        } while (tmp != bh);
 318        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 319        local_irq_restore(flags);
 320
 321        /*
 322         * If none of the buffers had errors and they are all
 323         * uptodate then we can set the page uptodate.
 324         */
 325        if (page_uptodate && !PageError(page))
 326                SetPageUptodate(page);
 327        unlock_page(page);
 328        return;
 329
 330still_busy:
 331        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 332        local_irq_restore(flags);
 333        return;
 334}
 335
 336/*
 337 * Completion handler for block_write_full_page() - pages which are unlocked
 338 * during I/O, and which have PageWriteback cleared upon I/O completion.
 339 */
 340void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 341{
 342        unsigned long flags;
 343        struct buffer_head *first;
 344        struct buffer_head *tmp;
 345        struct page *page;
 346
 347        BUG_ON(!buffer_async_write(bh));
 348
 349        page = bh->b_page;
 350        if (uptodate) {
 351                set_buffer_uptodate(bh);
 352        } else {
 353                buffer_io_error(bh, ", lost async page write");
 354                set_bit(AS_EIO, &page->mapping->flags);
 355                set_buffer_write_io_error(bh);
 356                clear_buffer_uptodate(bh);
 357                SetPageError(page);
 358        }
 359
 360        first = page_buffers(page);
 361        local_irq_save(flags);
 362        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 363
 364        clear_buffer_async_write(bh);
 365        unlock_buffer(bh);
 366        tmp = bh->b_this_page;
 367        while (tmp != bh) {
 368                if (buffer_async_write(tmp)) {
 369                        BUG_ON(!buffer_locked(tmp));
 370                        goto still_busy;
 371                }
 372                tmp = tmp->b_this_page;
 373        }
 374        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 375        local_irq_restore(flags);
 376        end_page_writeback(page);
 377        return;
 378
 379still_busy:
 380        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 381        local_irq_restore(flags);
 382        return;
 383}
 384EXPORT_SYMBOL(end_buffer_async_write);
 385
 386/*
 387 * If a page's buffers are under async readin (end_buffer_async_read
 388 * completion) then there is a possibility that another thread of
 389 * control could lock one of the buffers after it has completed
 390 * but while some of the other buffers have not completed.  This
 391 * locked buffer would confuse end_buffer_async_read() into not unlocking
 392 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 393 * that this buffer is not under async I/O.
 394 *
 395 * The page comes unlocked when it has no locked buffer_async buffers
 396 * left.
 397 *
 398 * PageLocked prevents anyone starting new async I/O reads any of
 399 * the buffers.
 400 *
 401 * PageWriteback is used to prevent simultaneous writeout of the same
 402 * page.
 403 *
 404 * PageLocked prevents anyone from starting writeback of a page which is
 405 * under read I/O (PageWriteback is only ever set against a locked page).
 406 */
 407static void mark_buffer_async_read(struct buffer_head *bh)
 408{
 409        bh->b_end_io = end_buffer_async_read;
 410        set_buffer_async_read(bh);
 411}
 412
 413static void mark_buffer_async_write_endio(struct buffer_head *bh,
 414                                          bh_end_io_t *handler)
 415{
 416        bh->b_end_io = handler;
 417        set_buffer_async_write(bh);
 418}
 419
 420void mark_buffer_async_write(struct buffer_head *bh)
 421{
 422        mark_buffer_async_write_endio(bh, end_buffer_async_write);
 423}
 424EXPORT_SYMBOL(mark_buffer_async_write);
 425
 426
 427/*
 428 * fs/buffer.c contains helper functions for buffer-backed address space's
 429 * fsync functions.  A common requirement for buffer-based filesystems is
 430 * that certain data from the backing blockdev needs to be written out for
 431 * a successful fsync().  For example, ext2 indirect blocks need to be
 432 * written back and waited upon before fsync() returns.
 433 *
 434 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 435 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 436 * management of a list of dependent buffers at ->i_mapping->private_list.
 437 *
 438 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 439 * from their controlling inode's queue when they are being freed.  But
 440 * try_to_free_buffers() will be operating against the *blockdev* mapping
 441 * at the time, not against the S_ISREG file which depends on those buffers.
 442 * So the locking for private_list is via the private_lock in the address_space
 443 * which backs the buffers.  Which is different from the address_space 
 444 * against which the buffers are listed.  So for a particular address_space,
 445 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 446 * mapping->private_list will always be protected by the backing blockdev's
 447 * ->private_lock.
 448 *
 449 * Which introduces a requirement: all buffers on an address_space's
 450 * ->private_list must be from the same address_space: the blockdev's.
 451 *
 452 * address_spaces which do not place buffers at ->private_list via these
 453 * utility functions are free to use private_lock and private_list for
 454 * whatever they want.  The only requirement is that list_empty(private_list)
 455 * be true at clear_inode() time.
 456 *
 457 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 458 * filesystems should do that.  invalidate_inode_buffers() should just go
 459 * BUG_ON(!list_empty).
 460 *
 461 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 462 * take an address_space, not an inode.  And it should be called
 463 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 464 * queued up.
 465 *
 466 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 467 * list if it is already on a list.  Because if the buffer is on a list,
 468 * it *must* already be on the right one.  If not, the filesystem is being
 469 * silly.  This will save a ton of locking.  But first we have to ensure
 470 * that buffers are taken *off* the old inode's list when they are freed
 471 * (presumably in truncate).  That requires careful auditing of all
 472 * filesystems (do it inside bforget()).  It could also be done by bringing
 473 * b_inode back.
 474 */
 475
 476/*
 477 * The buffer's backing address_space's private_lock must be held
 478 */
 479static void __remove_assoc_queue(struct buffer_head *bh)
 480{
 481        list_del_init(&bh->b_assoc_buffers);
 482        WARN_ON(!bh->b_assoc_map);
 483        if (buffer_write_io_error(bh))
 484                set_bit(AS_EIO, &bh->b_assoc_map->flags);
 485        bh->b_assoc_map = NULL;
 486}
 487
 488int inode_has_buffers(struct inode *inode)
 489{
 490        return !list_empty(&inode->i_data.private_list);
 491}
 492
 493/*
 494 * osync is designed to support O_SYNC io.  It waits synchronously for
 495 * all already-submitted IO to complete, but does not queue any new
 496 * writes to the disk.
 497 *
 498 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 499 * you dirty the buffers, and then use osync_inode_buffers to wait for
 500 * completion.  Any other dirty buffers which are not yet queued for
 501 * write will not be flushed to disk by the osync.
 502 */
 503static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 504{
 505        struct buffer_head *bh;
 506        struct list_head *p;
 507        int err = 0;
 508
 509        spin_lock(lock);
 510repeat:
 511        list_for_each_prev(p, list) {
 512                bh = BH_ENTRY(p);
 513                if (buffer_locked(bh)) {
 514                        get_bh(bh);
 515                        spin_unlock(lock);
 516                        wait_on_buffer(bh);
 517                        if (!buffer_uptodate(bh))
 518                                err = -EIO;
 519                        brelse(bh);
 520                        spin_lock(lock);
 521                        goto repeat;
 522                }
 523        }
 524        spin_unlock(lock);
 525        return err;
 526}
 527
 528static void do_thaw_one(struct super_block *sb, void *unused)
 529{
 530        char b[BDEVNAME_SIZE];
 531        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 532                printk(KERN_WARNING "Emergency Thaw on %s\n",
 533                       bdevname(sb->s_bdev, b));
 534}
 535
 536static void do_thaw_all(struct work_struct *work)
 537{
 538        iterate_supers(do_thaw_one, NULL);
 539        kfree(work);
 540        printk(KERN_WARNING "Emergency Thaw complete\n");
 541}
 542
 543/**
 544 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 545 *
 546 * Used for emergency unfreeze of all filesystems via SysRq
 547 */
 548void emergency_thaw_all(void)
 549{
 550        struct work_struct *work;
 551
 552        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 553        if (work) {
 554                INIT_WORK(work, do_thaw_all);
 555                schedule_work(work);
 556        }
 557}
 558
 559/**
 560 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 561 * @mapping: the mapping which wants those buffers written
 562 *
 563 * Starts I/O against the buffers at mapping->private_list, and waits upon
 564 * that I/O.
 565 *
 566 * Basically, this is a convenience function for fsync().
 567 * @mapping is a file or directory which needs those buffers to be written for
 568 * a successful fsync().
 569 */
 570int sync_mapping_buffers(struct address_space *mapping)
 571{
 572        struct address_space *buffer_mapping = mapping->private_data;
 573
 574        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 575                return 0;
 576
 577        return fsync_buffers_list(&buffer_mapping->private_lock,
 578                                        &mapping->private_list);
 579}
 580EXPORT_SYMBOL(sync_mapping_buffers);
 581
 582/*
 583 * Called when we've recently written block `bblock', and it is known that
 584 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 585 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 586 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 587 */
 588void write_boundary_block(struct block_device *bdev,
 589                        sector_t bblock, unsigned blocksize)
 590{
 591        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 592        if (bh) {
 593                if (buffer_dirty(bh))
 594                        ll_rw_block(WRITE, 1, &bh);
 595                put_bh(bh);
 596        }
 597}
 598
 599void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 600{
 601        struct address_space *mapping = inode->i_mapping;
 602        struct address_space *buffer_mapping = bh->b_page->mapping;
 603
 604        mark_buffer_dirty(bh);
 605        if (!mapping->private_data) {
 606                mapping->private_data = buffer_mapping;
 607        } else {
 608                BUG_ON(mapping->private_data != buffer_mapping);
 609        }
 610        if (!bh->b_assoc_map) {
 611                spin_lock(&buffer_mapping->private_lock);
 612                list_move_tail(&bh->b_assoc_buffers,
 613                                &mapping->private_list);
 614                bh->b_assoc_map = mapping;
 615                spin_unlock(&buffer_mapping->private_lock);
 616        }
 617}
 618EXPORT_SYMBOL(mark_buffer_dirty_inode);
 619
 620/*
 621 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 622 * dirty.
 623 *
 624 * If warn is true, then emit a warning if the page is not uptodate and has
 625 * not been truncated.
 626 */
 627static void __set_page_dirty(struct page *page,
 628                struct address_space *mapping, int warn)
 629{
 630        unsigned long flags;
 631
 632        spin_lock_irqsave(&mapping->tree_lock, flags);
 633        if (page->mapping) {    /* Race with truncate? */
 634                WARN_ON_ONCE(warn && !PageUptodate(page));
 635                account_page_dirtied(page, mapping);
 636                radix_tree_tag_set(&mapping->page_tree,
 637                                page_index(page), PAGECACHE_TAG_DIRTY);
 638        }
 639        spin_unlock_irqrestore(&mapping->tree_lock, flags);
 640        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 641}
 642
 643/*
 644 * Add a page to the dirty page list.
 645 *
 646 * It is a sad fact of life that this function is called from several places
 647 * deeply under spinlocking.  It may not sleep.
 648 *
 649 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 650 * dirty-state coherency between the page and the buffers.  It the page does
 651 * not have buffers then when they are later attached they will all be set
 652 * dirty.
 653 *
 654 * The buffers are dirtied before the page is dirtied.  There's a small race
 655 * window in which a writepage caller may see the page cleanness but not the
 656 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 657 * before the buffers, a concurrent writepage caller could clear the page dirty
 658 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 659 * page on the dirty page list.
 660 *
 661 * We use private_lock to lock against try_to_free_buffers while using the
 662 * page's buffer list.  Also use this to protect against clean buffers being
 663 * added to the page after it was set dirty.
 664 *
 665 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 666 * address_space though.
 667 */
 668int __set_page_dirty_buffers(struct page *page)
 669{
 670        int newly_dirty;
 671        struct address_space *mapping = page_mapping(page);
 672
 673        if (unlikely(!mapping))
 674                return !TestSetPageDirty(page);
 675
 676        spin_lock(&mapping->private_lock);
 677        if (page_has_buffers(page)) {
 678                struct buffer_head *head = page_buffers(page);
 679                struct buffer_head *bh = head;
 680
 681                do {
 682                        set_buffer_dirty(bh);
 683                        bh = bh->b_this_page;
 684                } while (bh != head);
 685        }
 686        newly_dirty = !TestSetPageDirty(page);
 687        spin_unlock(&mapping->private_lock);
 688
 689        if (newly_dirty)
 690                __set_page_dirty(page, mapping, 1);
 691        return newly_dirty;
 692}
 693EXPORT_SYMBOL(__set_page_dirty_buffers);
 694
 695/*
 696 * Write out and wait upon a list of buffers.
 697 *
 698 * We have conflicting pressures: we want to make sure that all
 699 * initially dirty buffers get waited on, but that any subsequently
 700 * dirtied buffers don't.  After all, we don't want fsync to last
 701 * forever if somebody is actively writing to the file.
 702 *
 703 * Do this in two main stages: first we copy dirty buffers to a
 704 * temporary inode list, queueing the writes as we go.  Then we clean
 705 * up, waiting for those writes to complete.
 706 * 
 707 * During this second stage, any subsequent updates to the file may end
 708 * up refiling the buffer on the original inode's dirty list again, so
 709 * there is a chance we will end up with a buffer queued for write but
 710 * not yet completed on that list.  So, as a final cleanup we go through
 711 * the osync code to catch these locked, dirty buffers without requeuing
 712 * any newly dirty buffers for write.
 713 */
 714static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 715{
 716        struct buffer_head *bh;
 717        struct list_head tmp;
 718        struct address_space *mapping;
 719        int err = 0, err2;
 720        struct blk_plug plug;
 721
 722        INIT_LIST_HEAD(&tmp);
 723        blk_start_plug(&plug);
 724
 725        spin_lock(lock);
 726        while (!list_empty(list)) {
 727                bh = BH_ENTRY(list->next);
 728                mapping = bh->b_assoc_map;
 729                __remove_assoc_queue(bh);
 730                /* Avoid race with mark_buffer_dirty_inode() which does
 731                 * a lockless check and we rely on seeing the dirty bit */
 732                smp_mb();
 733                if (buffer_dirty(bh) || buffer_locked(bh)) {
 734                        list_add(&bh->b_assoc_buffers, &tmp);
 735                        bh->b_assoc_map = mapping;
 736                        if (buffer_dirty(bh)) {
 737                                get_bh(bh);
 738                                spin_unlock(lock);
 739                                /*
 740                                 * Ensure any pending I/O completes so that
 741                                 * write_dirty_buffer() actually writes the
 742                                 * current contents - it is a noop if I/O is
 743                                 * still in flight on potentially older
 744                                 * contents.
 745                                 */
 746                                write_dirty_buffer(bh, WRITE_SYNC);
 747
 748                                /*
 749                                 * Kick off IO for the previous mapping. Note
 750                                 * that we will not run the very last mapping,
 751                                 * wait_on_buffer() will do that for us
 752                                 * through sync_buffer().
 753                                 */
 754                                brelse(bh);
 755                                spin_lock(lock);
 756                        }
 757                }
 758        }
 759
 760        spin_unlock(lock);
 761        blk_finish_plug(&plug);
 762        spin_lock(lock);
 763
 764        while (!list_empty(&tmp)) {
 765                bh = BH_ENTRY(tmp.prev);
 766                get_bh(bh);
 767                mapping = bh->b_assoc_map;
 768                __remove_assoc_queue(bh);
 769                /* Avoid race with mark_buffer_dirty_inode() which does
 770                 * a lockless check and we rely on seeing the dirty bit */
 771                smp_mb();
 772                if (buffer_dirty(bh)) {
 773                        list_add(&bh->b_assoc_buffers,
 774                                 &mapping->private_list);
 775                        bh->b_assoc_map = mapping;
 776                }
 777                spin_unlock(lock);
 778                wait_on_buffer(bh);
 779                if (!buffer_uptodate(bh))
 780                        err = -EIO;
 781                brelse(bh);
 782                spin_lock(lock);
 783        }
 784        
 785        spin_unlock(lock);
 786        err2 = osync_buffers_list(lock, list);
 787        if (err)
 788                return err;
 789        else
 790                return err2;
 791}
 792
 793/*
 794 * Invalidate any and all dirty buffers on a given inode.  We are
 795 * probably unmounting the fs, but that doesn't mean we have already
 796 * done a sync().  Just drop the buffers from the inode list.
 797 *
 798 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 799 * assumes that all the buffers are against the blockdev.  Not true
 800 * for reiserfs.
 801 */
 802void invalidate_inode_buffers(struct inode *inode)
 803{
 804        if (inode_has_buffers(inode)) {
 805                struct address_space *mapping = &inode->i_data;
 806                struct list_head *list = &mapping->private_list;
 807                struct address_space *buffer_mapping = mapping->private_data;
 808
 809                spin_lock(&buffer_mapping->private_lock);
 810                while (!list_empty(list))
 811                        __remove_assoc_queue(BH_ENTRY(list->next));
 812                spin_unlock(&buffer_mapping->private_lock);
 813        }
 814}
 815EXPORT_SYMBOL(invalidate_inode_buffers);
 816
 817/*
 818 * Remove any clean buffers from the inode's buffer list.  This is called
 819 * when we're trying to free the inode itself.  Those buffers can pin it.
 820 *
 821 * Returns true if all buffers were removed.
 822 */
 823int remove_inode_buffers(struct inode *inode)
 824{
 825        int ret = 1;
 826
 827        if (inode_has_buffers(inode)) {
 828                struct address_space *mapping = &inode->i_data;
 829                struct list_head *list = &mapping->private_list;
 830                struct address_space *buffer_mapping = mapping->private_data;
 831
 832                spin_lock(&buffer_mapping->private_lock);
 833                while (!list_empty(list)) {
 834                        struct buffer_head *bh = BH_ENTRY(list->next);
 835                        if (buffer_dirty(bh)) {
 836                                ret = 0;
 837                                break;
 838                        }
 839                        __remove_assoc_queue(bh);
 840                }
 841                spin_unlock(&buffer_mapping->private_lock);
 842        }
 843        return ret;
 844}
 845
 846/*
 847 * Create the appropriate buffers when given a page for data area and
 848 * the size of each buffer.. Use the bh->b_this_page linked list to
 849 * follow the buffers created.  Return NULL if unable to create more
 850 * buffers.
 851 *
 852 * The retry flag is used to differentiate async IO (paging, swapping)
 853 * which may not fail from ordinary buffer allocations.
 854 */
 855struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 856                int retry)
 857{
 858        struct buffer_head *bh, *head;
 859        long offset;
 860
 861try_again:
 862        head = NULL;
 863        offset = PAGE_SIZE;
 864        while ((offset -= size) >= 0) {
 865                bh = alloc_buffer_head(GFP_NOFS);
 866                if (!bh)
 867                        goto no_grow;
 868
 869                bh->b_this_page = head;
 870                bh->b_blocknr = -1;
 871                head = bh;
 872
 873                bh->b_size = size;
 874
 875                /* Link the buffer to its page */
 876                set_bh_page(bh, page, offset);
 877        }
 878        return head;
 879/*
 880 * In case anything failed, we just free everything we got.
 881 */
 882no_grow:
 883        if (head) {
 884                do {
 885                        bh = head;
 886                        head = head->b_this_page;
 887                        free_buffer_head(bh);
 888                } while (head);
 889        }
 890
 891        /*
 892         * Return failure for non-async IO requests.  Async IO requests
 893         * are not allowed to fail, so we have to wait until buffer heads
 894         * become available.  But we don't want tasks sleeping with 
 895         * partially complete buffers, so all were released above.
 896         */
 897        if (!retry)
 898                return NULL;
 899
 900        /* We're _really_ low on memory. Now we just
 901         * wait for old buffer heads to become free due to
 902         * finishing IO.  Since this is an async request and
 903         * the reserve list is empty, we're sure there are 
 904         * async buffer heads in use.
 905         */
 906        free_more_memory();
 907        goto try_again;
 908}
 909EXPORT_SYMBOL_GPL(alloc_page_buffers);
 910
 911static inline void
 912link_dev_buffers(struct page *page, struct buffer_head *head)
 913{
 914        struct buffer_head *bh, *tail;
 915
 916        bh = head;
 917        do {
 918                tail = bh;
 919                bh = bh->b_this_page;
 920        } while (bh);
 921        tail->b_this_page = head;
 922        attach_page_buffers(page, head);
 923}
 924
 925static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 926{
 927        sector_t retval = ~((sector_t)0);
 928        loff_t sz = i_size_read(bdev->bd_inode);
 929
 930        if (sz) {
 931                unsigned int sizebits = blksize_bits(size);
 932                retval = (sz >> sizebits);
 933        }
 934        return retval;
 935}
 936
 937/*
 938 * Initialise the state of a blockdev page's buffers.
 939 */ 
 940static sector_t
 941init_page_buffers(struct page *page, struct block_device *bdev,
 942                        sector_t block, int size)
 943{
 944        struct buffer_head *head = page_buffers(page);
 945        struct buffer_head *bh = head;
 946        int uptodate = PageUptodate(page);
 947        sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 948
 949        do {
 950                if (!buffer_mapped(bh)) {
 951                        init_buffer(bh, NULL, NULL);
 952                        bh->b_bdev = bdev;
 953                        bh->b_blocknr = block;
 954                        if (uptodate)
 955                                set_buffer_uptodate(bh);
 956                        if (block < end_block)
 957                                set_buffer_mapped(bh);
 958                }
 959                block++;
 960                bh = bh->b_this_page;
 961        } while (bh != head);
 962
 963        /*
 964         * Caller needs to validate requested block against end of device.
 965         */
 966        return end_block;
 967}
 968
 969/*
 970 * Create the page-cache page that contains the requested block.
 971 *
 972 * This is used purely for blockdev mappings.
 973 */
 974static int
 975grow_dev_page(struct block_device *bdev, sector_t block,
 976              pgoff_t index, int size, int sizebits, gfp_t gfp)
 977{
 978        struct inode *inode = bdev->bd_inode;
 979        struct page *page;
 980        struct buffer_head *bh;
 981        sector_t end_block;
 982        int ret = 0;            /* Will call free_more_memory() */
 983        gfp_t gfp_mask;
 984
 985        gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
 986
 987        /*
 988         * XXX: __getblk_slow() can not really deal with failure and
 989         * will endlessly loop on improvised global reclaim.  Prefer
 990         * looping in the allocator rather than here, at least that
 991         * code knows what it's doing.
 992         */
 993        gfp_mask |= __GFP_NOFAIL;
 994
 995        page = find_or_create_page(inode->i_mapping, index, gfp_mask);
 996        if (!page)
 997                return ret;
 998
 999        BUG_ON(!PageLocked(page));
1000
1001        if (page_has_buffers(page)) {
1002                bh = page_buffers(page);
1003                if (bh->b_size == size) {
1004                        end_block = init_page_buffers(page, bdev,
1005                                                (sector_t)index << sizebits,
1006                                                size);
1007                        goto done;
1008                }
1009                if (!try_to_free_buffers(page))
1010                        goto failed;
1011        }
1012
1013        /*
1014         * Allocate some buffers for this page
1015         */
1016        bh = alloc_page_buffers(page, size, 0);
1017        if (!bh)
1018                goto failed;
1019
1020        /*
1021         * Link the page to the buffers and initialise them.  Take the
1022         * lock to be atomic wrt __find_get_block(), which does not
1023         * run under the page lock.
1024         */
1025        spin_lock(&inode->i_mapping->private_lock);
1026        link_dev_buffers(page, bh);
1027        end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1028                        size);
1029        spin_unlock(&inode->i_mapping->private_lock);
1030done:
1031        ret = (block < end_block) ? 1 : -ENXIO;
1032failed:
1033        unlock_page(page);
1034        page_cache_release(page);
1035        return ret;
1036}
1037
1038/*
1039 * Create buffers for the specified block device block's page.  If
1040 * that page was dirty, the buffers are set dirty also.
1041 */
1042static int
1043grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1044{
1045        pgoff_t index;
1046        int sizebits;
1047
1048        sizebits = -1;
1049        do {
1050                sizebits++;
1051        } while ((size << sizebits) < PAGE_SIZE);
1052
1053        index = block >> sizebits;
1054
1055        /*
1056         * Check for a block which wants to lie outside our maximum possible
1057         * pagecache index.  (this comparison is done using sector_t types).
1058         */
1059        if (unlikely(index != block >> sizebits)) {
1060                char b[BDEVNAME_SIZE];
1061
1062                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1063                        "device %s\n",
1064                        __func__, (unsigned long long)block,
1065                        bdevname(bdev, b));
1066                return -EIO;
1067        }
1068
1069        /* Create a page with the proper size buffers.. */
1070        return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1071}
1072
1073struct buffer_head *
1074__getblk_slow(struct block_device *bdev, sector_t block,
1075             unsigned size, gfp_t gfp)
1076{
1077        /* Size must be multiple of hard sectorsize */
1078        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1079                        (size < 512 || size > PAGE_SIZE))) {
1080                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1081                                        size);
1082                printk(KERN_ERR "logical block size: %d\n",
1083                                        bdev_logical_block_size(bdev));
1084
1085                dump_stack();
1086                return NULL;
1087        }
1088
1089        for (;;) {
1090                struct buffer_head *bh;
1091                int ret;
1092
1093                bh = __find_get_block(bdev, block, size);
1094                if (bh)
1095                        return bh;
1096
1097                ret = grow_buffers(bdev, block, size, gfp);
1098                if (ret < 0)
1099                        return NULL;
1100                if (ret == 0)
1101                        free_more_memory();
1102        }
1103}
1104EXPORT_SYMBOL(__getblk_slow);
1105
1106/*
1107 * The relationship between dirty buffers and dirty pages:
1108 *
1109 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1110 * the page is tagged dirty in its radix tree.
1111 *
1112 * At all times, the dirtiness of the buffers represents the dirtiness of
1113 * subsections of the page.  If the page has buffers, the page dirty bit is
1114 * merely a hint about the true dirty state.
1115 *
1116 * When a page is set dirty in its entirety, all its buffers are marked dirty
1117 * (if the page has buffers).
1118 *
1119 * When a buffer is marked dirty, its page is dirtied, but the page's other
1120 * buffers are not.
1121 *
1122 * Also.  When blockdev buffers are explicitly read with bread(), they
1123 * individually become uptodate.  But their backing page remains not
1124 * uptodate - even if all of its buffers are uptodate.  A subsequent
1125 * block_read_full_page() against that page will discover all the uptodate
1126 * buffers, will set the page uptodate and will perform no I/O.
1127 */
1128
1129/**
1130 * mark_buffer_dirty - mark a buffer_head as needing writeout
1131 * @bh: the buffer_head to mark dirty
1132 *
1133 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1134 * backing page dirty, then tag the page as dirty in its address_space's radix
1135 * tree and then attach the address_space's inode to its superblock's dirty
1136 * inode list.
1137 *
1138 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1139 * mapping->tree_lock and mapping->host->i_lock.
1140 */
1141void mark_buffer_dirty(struct buffer_head *bh)
1142{
1143        WARN_ON_ONCE(!buffer_uptodate(bh));
1144
1145        trace_block_dirty_buffer(bh);
1146
1147        /*
1148         * Very *carefully* optimize the it-is-already-dirty case.
1149         *
1150         * Don't let the final "is it dirty" escape to before we
1151         * perhaps modified the buffer.
1152         */
1153        if (buffer_dirty(bh)) {
1154                smp_mb();
1155                if (buffer_dirty(bh))
1156                        return;
1157        }
1158
1159        if (!test_set_buffer_dirty(bh)) {
1160                struct page *page = bh->b_page;
1161                if (!TestSetPageDirty(page)) {
1162                        struct address_space *mapping = page_mapping(page);
1163                        if (mapping)
1164                                __set_page_dirty(page, mapping, 0);
1165                }
1166        }
1167}
1168EXPORT_SYMBOL(mark_buffer_dirty);
1169
1170/*
1171 * Decrement a buffer_head's reference count.  If all buffers against a page
1172 * have zero reference count, are clean and unlocked, and if the page is clean
1173 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1174 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1175 * a page but it ends up not being freed, and buffers may later be reattached).
1176 */
1177void __brelse(struct buffer_head * buf)
1178{
1179        if (atomic_read(&buf->b_count)) {
1180                put_bh(buf);
1181                return;
1182        }
1183        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1184}
1185EXPORT_SYMBOL(__brelse);
1186
1187/*
1188 * bforget() is like brelse(), except it discards any
1189 * potentially dirty data.
1190 */
1191void __bforget(struct buffer_head *bh)
1192{
1193        clear_buffer_dirty(bh);
1194        if (bh->b_assoc_map) {
1195                struct address_space *buffer_mapping = bh->b_page->mapping;
1196
1197                spin_lock(&buffer_mapping->private_lock);
1198                list_del_init(&bh->b_assoc_buffers);
1199                bh->b_assoc_map = NULL;
1200                spin_unlock(&buffer_mapping->private_lock);
1201        }
1202        __brelse(bh);
1203}
1204EXPORT_SYMBOL(__bforget);
1205
1206static struct buffer_head *__bread_slow(struct buffer_head *bh)
1207{
1208        lock_buffer(bh);
1209        if (buffer_uptodate(bh)) {
1210                unlock_buffer(bh);
1211                return bh;
1212        } else {
1213                get_bh(bh);
1214                bh->b_end_io = end_buffer_read_sync;
1215                submit_bh(READ, bh);
1216                wait_on_buffer(bh);
1217                if (buffer_uptodate(bh))
1218                        return bh;
1219        }
1220        brelse(bh);
1221        return NULL;
1222}
1223
1224/*
1225 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1226 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1227 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1228 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1229 * CPU's LRUs at the same time.
1230 *
1231 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1232 * sb_find_get_block().
1233 *
1234 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1235 * a local interrupt disable for that.
1236 */
1237
1238#define BH_LRU_SIZE     16
1239
1240struct bh_lru {
1241        struct buffer_head *bhs[BH_LRU_SIZE];
1242};
1243
1244static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1245
1246#ifdef CONFIG_SMP
1247#define bh_lru_lock()   local_irq_disable()
1248#define bh_lru_unlock() local_irq_enable()
1249#else
1250#define bh_lru_lock()   preempt_disable()
1251#define bh_lru_unlock() preempt_enable()
1252#endif
1253
1254static inline void check_irqs_on(void)
1255{
1256#ifdef irqs_disabled
1257        BUG_ON(irqs_disabled());
1258#endif
1259}
1260
1261/*
1262 * The LRU management algorithm is dopey-but-simple.  Sorry.
1263 */
1264static void bh_lru_install(struct buffer_head *bh)
1265{
1266        struct buffer_head *evictee = NULL;
1267
1268        check_irqs_on();
1269        bh_lru_lock();
1270        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1271                struct buffer_head *bhs[BH_LRU_SIZE];
1272                int in;
1273                int out = 0;
1274
1275                get_bh(bh);
1276                bhs[out++] = bh;
1277                for (in = 0; in < BH_LRU_SIZE; in++) {
1278                        struct buffer_head *bh2 =
1279                                __this_cpu_read(bh_lrus.bhs[in]);
1280
1281                        if (bh2 == bh) {
1282                                __brelse(bh2);
1283                        } else {
1284                                if (out >= BH_LRU_SIZE) {
1285                                        BUG_ON(evictee != NULL);
1286                                        evictee = bh2;
1287                                } else {
1288                                        bhs[out++] = bh2;
1289                                }
1290                        }
1291                }
1292                while (out < BH_LRU_SIZE)
1293                        bhs[out++] = NULL;
1294                memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1295        }
1296        bh_lru_unlock();
1297
1298        if (evictee)
1299                __brelse(evictee);
1300}
1301
1302/*
1303 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1304 */
1305static struct buffer_head *
1306lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1307{
1308        struct buffer_head *ret = NULL;
1309        unsigned int i;
1310
1311        check_irqs_on();
1312        bh_lru_lock();
1313        for (i = 0; i < BH_LRU_SIZE; i++) {
1314                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1315
1316                if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1317                    bh->b_size == size) {
1318                        if (i) {
1319                                while (i) {
1320                                        __this_cpu_write(bh_lrus.bhs[i],
1321                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
1322                                        i--;
1323                                }
1324                                __this_cpu_write(bh_lrus.bhs[0], bh);
1325                        }
1326                        get_bh(bh);
1327                        ret = bh;
1328                        break;
1329                }
1330        }
1331        bh_lru_unlock();
1332        return ret;
1333}
1334
1335/*
1336 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1337 * it in the LRU and mark it as accessed.  If it is not present then return
1338 * NULL
1339 */
1340struct buffer_head *
1341__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1342{
1343        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1344
1345        if (bh == NULL) {
1346                /* __find_get_block_slow will mark the page accessed */
1347                bh = __find_get_block_slow(bdev, block);
1348                if (bh)
1349                        bh_lru_install(bh);
1350        } else
1351                touch_buffer(bh);
1352
1353        return bh;
1354}
1355EXPORT_SYMBOL(__find_get_block);
1356
1357/*
1358 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1359 * which corresponds to the passed block_device, block and size. The
1360 * returned buffer has its reference count incremented.
1361 *
1362 * __getblk_gfp() will lock up the machine if grow_dev_page's
1363 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1364 */
1365struct buffer_head *
1366__getblk_gfp(struct block_device *bdev, sector_t block,
1367             unsigned size, gfp_t gfp)
1368{
1369        struct buffer_head *bh = __find_get_block(bdev, block, size);
1370
1371        might_sleep();
1372        if (bh == NULL)
1373                bh = __getblk_slow(bdev, block, size, gfp);
1374        return bh;
1375}
1376EXPORT_SYMBOL(__getblk_gfp);
1377
1378/*
1379 * Do async read-ahead on a buffer..
1380 */
1381void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1382{
1383        struct buffer_head *bh = __getblk(bdev, block, size);
1384        if (likely(bh)) {
1385                ll_rw_block(READA, 1, &bh);
1386                brelse(bh);
1387        }
1388}
1389EXPORT_SYMBOL(__breadahead);
1390
1391/**
1392 *  __bread_gfp() - reads a specified block and returns the bh
1393 *  @bdev: the block_device to read from
1394 *  @block: number of block
1395 *  @size: size (in bytes) to read
1396 *  @gfp: page allocation flag
1397 *
1398 *  Reads a specified block, and returns buffer head that contains it.
1399 *  The page cache can be allocated from non-movable area
1400 *  not to prevent page migration if you set gfp to zero.
1401 *  It returns NULL if the block was unreadable.
1402 */
1403struct buffer_head *
1404__bread_gfp(struct block_device *bdev, sector_t block,
1405                   unsigned size, gfp_t gfp)
1406{
1407        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1408
1409        if (likely(bh) && !buffer_uptodate(bh))
1410                bh = __bread_slow(bh);
1411        return bh;
1412}
1413EXPORT_SYMBOL(__bread_gfp);
1414
1415/*
1416 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1417 * This doesn't race because it runs in each cpu either in irq
1418 * or with preempt disabled.
1419 */
1420static void invalidate_bh_lru(void *arg)
1421{
1422        struct bh_lru *b = &get_cpu_var(bh_lrus);
1423        int i;
1424
1425        for (i = 0; i < BH_LRU_SIZE; i++) {
1426                brelse(b->bhs[i]);
1427                b->bhs[i] = NULL;
1428        }
1429        put_cpu_var(bh_lrus);
1430}
1431
1432static bool has_bh_in_lru(int cpu, void *dummy)
1433{
1434        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1435        int i;
1436        
1437        for (i = 0; i < BH_LRU_SIZE; i++) {
1438                if (b->bhs[i])
1439                        return 1;
1440        }
1441
1442        return 0;
1443}
1444
1445void invalidate_bh_lrus(void)
1446{
1447        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1448}
1449EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1450
1451void set_bh_page(struct buffer_head *bh,
1452                struct page *page, unsigned long offset)
1453{
1454        bh->b_page = page;
1455        BUG_ON(offset >= PAGE_SIZE);
1456        if (PageHighMem(page))
1457                /*
1458                 * This catches illegal uses and preserves the offset:
1459                 */
1460                bh->b_data = (char *)(0 + offset);
1461        else
1462                bh->b_data = page_address(page) + offset;
1463}
1464EXPORT_SYMBOL(set_bh_page);
1465
1466/*
1467 * Called when truncating a buffer on a page completely.
1468 */
1469
1470/* Bits that are cleared during an invalidate */
1471#define BUFFER_FLAGS_DISCARD \
1472        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1473         1 << BH_Delay | 1 << BH_Unwritten)
1474
1475static void discard_buffer(struct buffer_head * bh)
1476{
1477        unsigned long b_state, b_state_old;
1478
1479        lock_buffer(bh);
1480        clear_buffer_dirty(bh);
1481        bh->b_bdev = NULL;
1482        b_state = bh->b_state;
1483        for (;;) {
1484                b_state_old = cmpxchg(&bh->b_state, b_state,
1485                                      (b_state & ~BUFFER_FLAGS_DISCARD));
1486                if (b_state_old == b_state)
1487                        break;
1488                b_state = b_state_old;
1489        }
1490        unlock_buffer(bh);
1491}
1492
1493/**
1494 * block_invalidatepage - invalidate part or all of a buffer-backed page
1495 *
1496 * @page: the page which is affected
1497 * @offset: start of the range to invalidate
1498 * @length: length of the range to invalidate
1499 *
1500 * block_invalidatepage() is called when all or part of the page has become
1501 * invalidated by a truncate operation.
1502 *
1503 * block_invalidatepage() does not have to release all buffers, but it must
1504 * ensure that no dirty buffer is left outside @offset and that no I/O
1505 * is underway against any of the blocks which are outside the truncation
1506 * point.  Because the caller is about to free (and possibly reuse) those
1507 * blocks on-disk.
1508 */
1509void block_invalidatepage(struct page *page, unsigned int offset,
1510                          unsigned int length)
1511{
1512        struct buffer_head *head, *bh, *next;
1513        unsigned int curr_off = 0;
1514        unsigned int stop = length + offset;
1515
1516        BUG_ON(!PageLocked(page));
1517        if (!page_has_buffers(page))
1518                goto out;
1519
1520        /*
1521         * Check for overflow
1522         */
1523        BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1524
1525        head = page_buffers(page);
1526        bh = head;
1527        do {
1528                unsigned int next_off = curr_off + bh->b_size;
1529                next = bh->b_this_page;
1530
1531                /*
1532                 * Are we still fully in range ?
1533                 */
1534                if (next_off > stop)
1535                        goto out;
1536
1537                /*
1538                 * is this block fully invalidated?
1539                 */
1540                if (offset <= curr_off)
1541                        discard_buffer(bh);
1542                curr_off = next_off;
1543                bh = next;
1544        } while (bh != head);
1545
1546        /*
1547         * We release buffers only if the entire page is being invalidated.
1548         * The get_block cached value has been unconditionally invalidated,
1549         * so real IO is not possible anymore.
1550         */
1551        if (offset == 0)
1552                try_to_release_page(page, 0);
1553out:
1554        return;
1555}
1556EXPORT_SYMBOL(block_invalidatepage);
1557
1558
1559/*
1560 * We attach and possibly dirty the buffers atomically wrt
1561 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1562 * is already excluded via the page lock.
1563 */
1564void create_empty_buffers(struct page *page,
1565                        unsigned long blocksize, unsigned long b_state)
1566{
1567        struct buffer_head *bh, *head, *tail;
1568
1569        head = alloc_page_buffers(page, blocksize, 1);
1570        bh = head;
1571        do {
1572                bh->b_state |= b_state;
1573                tail = bh;
1574                bh = bh->b_this_page;
1575        } while (bh);
1576        tail->b_this_page = head;
1577
1578        spin_lock(&page->mapping->private_lock);
1579        if (PageUptodate(page) || PageDirty(page)) {
1580                bh = head;
1581                do {
1582                        if (PageDirty(page))
1583                                set_buffer_dirty(bh);
1584                        if (PageUptodate(page))
1585                                set_buffer_uptodate(bh);
1586                        bh = bh->b_this_page;
1587                } while (bh != head);
1588        }
1589        attach_page_buffers(page, head);
1590        spin_unlock(&page->mapping->private_lock);
1591}
1592EXPORT_SYMBOL(create_empty_buffers);
1593
1594/*
1595 * We are taking a block for data and we don't want any output from any
1596 * buffer-cache aliases starting from return from that function and
1597 * until the moment when something will explicitly mark the buffer
1598 * dirty (hopefully that will not happen until we will free that block ;-)
1599 * We don't even need to mark it not-uptodate - nobody can expect
1600 * anything from a newly allocated buffer anyway. We used to used
1601 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1602 * don't want to mark the alias unmapped, for example - it would confuse
1603 * anyone who might pick it with bread() afterwards...
1604 *
1605 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1606 * be writeout I/O going on against recently-freed buffers.  We don't
1607 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1608 * only if we really need to.  That happens here.
1609 */
1610void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1611{
1612        struct buffer_head *old_bh;
1613
1614        might_sleep();
1615
1616        old_bh = __find_get_block_slow(bdev, block);
1617        if (old_bh) {
1618                clear_buffer_dirty(old_bh);
1619                wait_on_buffer(old_bh);
1620                clear_buffer_req(old_bh);
1621                __brelse(old_bh);
1622        }
1623}
1624EXPORT_SYMBOL(unmap_underlying_metadata);
1625
1626/*
1627 * Size is a power-of-two in the range 512..PAGE_SIZE,
1628 * and the case we care about most is PAGE_SIZE.
1629 *
1630 * So this *could* possibly be written with those
1631 * constraints in mind (relevant mostly if some
1632 * architecture has a slow bit-scan instruction)
1633 */
1634static inline int block_size_bits(unsigned int blocksize)
1635{
1636        return ilog2(blocksize);
1637}
1638
1639static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1640{
1641        BUG_ON(!PageLocked(page));
1642
1643        if (!page_has_buffers(page))
1644                create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1645        return page_buffers(page);
1646}
1647
1648/*
1649 * NOTE! All mapped/uptodate combinations are valid:
1650 *
1651 *      Mapped  Uptodate        Meaning
1652 *
1653 *      No      No              "unknown" - must do get_block()
1654 *      No      Yes             "hole" - zero-filled
1655 *      Yes     No              "allocated" - allocated on disk, not read in
1656 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1657 *
1658 * "Dirty" is valid only with the last case (mapped+uptodate).
1659 */
1660
1661/*
1662 * While block_write_full_page is writing back the dirty buffers under
1663 * the page lock, whoever dirtied the buffers may decide to clean them
1664 * again at any time.  We handle that by only looking at the buffer
1665 * state inside lock_buffer().
1666 *
1667 * If block_write_full_page() is called for regular writeback
1668 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1669 * locked buffer.   This only can happen if someone has written the buffer
1670 * directly, with submit_bh().  At the address_space level PageWriteback
1671 * prevents this contention from occurring.
1672 *
1673 * If block_write_full_page() is called with wbc->sync_mode ==
1674 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1675 * causes the writes to be flagged as synchronous writes.
1676 */
1677static int __block_write_full_page(struct inode *inode, struct page *page,
1678                        get_block_t *get_block, struct writeback_control *wbc,
1679                        bh_end_io_t *handler)
1680{
1681        int err;
1682        sector_t block;
1683        sector_t last_block;
1684        struct buffer_head *bh, *head;
1685        unsigned int blocksize, bbits;
1686        int nr_underway = 0;
1687        int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1688                        WRITE_SYNC : WRITE);
1689
1690        head = create_page_buffers(page, inode,
1691                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1692
1693        /*
1694         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1695         * here, and the (potentially unmapped) buffers may become dirty at
1696         * any time.  If a buffer becomes dirty here after we've inspected it
1697         * then we just miss that fact, and the page stays dirty.
1698         *
1699         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1700         * handle that here by just cleaning them.
1701         */
1702
1703        bh = head;
1704        blocksize = bh->b_size;
1705        bbits = block_size_bits(blocksize);
1706
1707        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1708        last_block = (i_size_read(inode) - 1) >> bbits;
1709
1710        /*
1711         * Get all the dirty buffers mapped to disk addresses and
1712         * handle any aliases from the underlying blockdev's mapping.
1713         */
1714        do {
1715                if (block > last_block) {
1716                        /*
1717                         * mapped buffers outside i_size will occur, because
1718                         * this page can be outside i_size when there is a
1719                         * truncate in progress.
1720                         */
1721                        /*
1722                         * The buffer was zeroed by block_write_full_page()
1723                         */
1724                        clear_buffer_dirty(bh);
1725                        set_buffer_uptodate(bh);
1726                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1727                           buffer_dirty(bh)) {
1728                        WARN_ON(bh->b_size != blocksize);
1729                        err = get_block(inode, block, bh, 1);
1730                        if (err)
1731                                goto recover;
1732                        clear_buffer_delay(bh);
1733                        if (buffer_new(bh)) {
1734                                /* blockdev mappings never come here */
1735                                clear_buffer_new(bh);
1736                                unmap_underlying_metadata(bh->b_bdev,
1737                                                        bh->b_blocknr);
1738                        }
1739                }
1740                bh = bh->b_this_page;
1741                block++;
1742        } while (bh != head);
1743
1744        do {
1745                if (!buffer_mapped(bh))
1746                        continue;
1747                /*
1748                 * If it's a fully non-blocking write attempt and we cannot
1749                 * lock the buffer then redirty the page.  Note that this can
1750                 * potentially cause a busy-wait loop from writeback threads
1751                 * and kswapd activity, but those code paths have their own
1752                 * higher-level throttling.
1753                 */
1754                if (wbc->sync_mode != WB_SYNC_NONE) {
1755                        lock_buffer(bh);
1756                } else if (!trylock_buffer(bh)) {
1757                        redirty_page_for_writepage(wbc, page);
1758                        continue;
1759                }
1760                if (test_clear_buffer_dirty(bh)) {
1761                        mark_buffer_async_write_endio(bh, handler);
1762                } else {
1763                        unlock_buffer(bh);
1764                }
1765        } while ((bh = bh->b_this_page) != head);
1766
1767        /*
1768         * The page and its buffers are protected by PageWriteback(), so we can
1769         * drop the bh refcounts early.
1770         */
1771        BUG_ON(PageWriteback(page));
1772        set_page_writeback(page);
1773
1774        do {
1775                struct buffer_head *next = bh->b_this_page;
1776                if (buffer_async_write(bh)) {
1777                        submit_bh(write_op, bh);
1778                        nr_underway++;
1779                }
1780                bh = next;
1781        } while (bh != head);
1782        unlock_page(page);
1783
1784        err = 0;
1785done:
1786        if (nr_underway == 0) {
1787                /*
1788                 * The page was marked dirty, but the buffers were
1789                 * clean.  Someone wrote them back by hand with
1790                 * ll_rw_block/submit_bh.  A rare case.
1791                 */
1792                end_page_writeback(page);
1793
1794                /*
1795                 * The page and buffer_heads can be released at any time from
1796                 * here on.
1797                 */
1798        }
1799        return err;
1800
1801recover:
1802        /*
1803         * ENOSPC, or some other error.  We may already have added some
1804         * blocks to the file, so we need to write these out to avoid
1805         * exposing stale data.
1806         * The page is currently locked and not marked for writeback
1807         */
1808        bh = head;
1809        /* Recovery: lock and submit the mapped buffers */
1810        do {
1811                if (buffer_mapped(bh) && buffer_dirty(bh) &&
1812                    !buffer_delay(bh)) {
1813                        lock_buffer(bh);
1814                        mark_buffer_async_write_endio(bh, handler);
1815                } else {
1816                        /*
1817                         * The buffer may have been set dirty during
1818                         * attachment to a dirty page.
1819                         */
1820                        clear_buffer_dirty(bh);
1821                }
1822        } while ((bh = bh->b_this_page) != head);
1823        SetPageError(page);
1824        BUG_ON(PageWriteback(page));
1825        mapping_set_error(page->mapping, err);
1826        set_page_writeback(page);
1827        do {
1828                struct buffer_head *next = bh->b_this_page;
1829                if (buffer_async_write(bh)) {
1830                        clear_buffer_dirty(bh);
1831                        submit_bh(write_op, bh);
1832                        nr_underway++;
1833                }
1834                bh = next;
1835        } while (bh != head);
1836        unlock_page(page);
1837        goto done;
1838}
1839
1840/*
1841 * If a page has any new buffers, zero them out here, and mark them uptodate
1842 * and dirty so they'll be written out (in order to prevent uninitialised
1843 * block data from leaking). And clear the new bit.
1844 */
1845void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1846{
1847        unsigned int block_start, block_end;
1848        struct buffer_head *head, *bh;
1849
1850        BUG_ON(!PageLocked(page));
1851        if (!page_has_buffers(page))
1852                return;
1853
1854        bh = head = page_buffers(page);
1855        block_start = 0;
1856        do {
1857                block_end = block_start + bh->b_size;
1858
1859                if (buffer_new(bh)) {
1860                        if (block_end > from && block_start < to) {
1861                                if (!PageUptodate(page)) {
1862                                        unsigned start, size;
1863
1864                                        start = max(from, block_start);
1865                                        size = min(to, block_end) - start;
1866
1867                                        zero_user(page, start, size);
1868                                        set_buffer_uptodate(bh);
1869                                }
1870
1871                                clear_buffer_new(bh);
1872                                mark_buffer_dirty(bh);
1873                        }
1874                }
1875
1876                block_start = block_end;
1877                bh = bh->b_this_page;
1878        } while (bh != head);
1879}
1880EXPORT_SYMBOL(page_zero_new_buffers);
1881
1882int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1883                get_block_t *get_block)
1884{
1885        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1886        unsigned to = from + len;
1887        struct inode *inode = page->mapping->host;
1888        unsigned block_start, block_end;
1889        sector_t block;
1890        int err = 0;
1891        unsigned blocksize, bbits;
1892        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1893
1894        BUG_ON(!PageLocked(page));
1895        BUG_ON(from > PAGE_CACHE_SIZE);
1896        BUG_ON(to > PAGE_CACHE_SIZE);
1897        BUG_ON(from > to);
1898
1899        head = create_page_buffers(page, inode, 0);
1900        blocksize = head->b_size;
1901        bbits = block_size_bits(blocksize);
1902
1903        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1904
1905        for(bh = head, block_start = 0; bh != head || !block_start;
1906            block++, block_start=block_end, bh = bh->b_this_page) {
1907                block_end = block_start + blocksize;
1908                if (block_end <= from || block_start >= to) {
1909                        if (PageUptodate(page)) {
1910                                if (!buffer_uptodate(bh))
1911                                        set_buffer_uptodate(bh);
1912                        }
1913                        continue;
1914                }
1915                if (buffer_new(bh))
1916                        clear_buffer_new(bh);
1917                if (!buffer_mapped(bh)) {
1918                        WARN_ON(bh->b_size != blocksize);
1919                        err = get_block(inode, block, bh, 1);
1920                        if (err)
1921                                break;
1922                        if (buffer_new(bh)) {
1923                                unmap_underlying_metadata(bh->b_bdev,
1924                                                        bh->b_blocknr);
1925                                if (PageUptodate(page)) {
1926                                        clear_buffer_new(bh);
1927                                        set_buffer_uptodate(bh);
1928                                        mark_buffer_dirty(bh);
1929                                        continue;
1930                                }
1931                                if (block_end > to || block_start < from)
1932                                        zero_user_segments(page,
1933                                                to, block_end,
1934                                                block_start, from);
1935                                continue;
1936                        }
1937                }
1938                if (PageUptodate(page)) {
1939                        if (!buffer_uptodate(bh))
1940                                set_buffer_uptodate(bh);
1941                        continue; 
1942                }
1943                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1944                    !buffer_unwritten(bh) &&
1945                     (block_start < from || block_end > to)) {
1946                        ll_rw_block(READ, 1, &bh);
1947                        *wait_bh++=bh;
1948                }
1949        }
1950        /*
1951         * If we issued read requests - let them complete.
1952         */
1953        while(wait_bh > wait) {
1954                wait_on_buffer(*--wait_bh);
1955                if (!buffer_uptodate(*wait_bh))
1956                        err = -EIO;
1957        }
1958        if (unlikely(err))
1959                page_zero_new_buffers(page, from, to);
1960        return err;
1961}
1962EXPORT_SYMBOL(__block_write_begin);
1963
1964static int __block_commit_write(struct inode *inode, struct page *page,
1965                unsigned from, unsigned to)
1966{
1967        unsigned block_start, block_end;
1968        int partial = 0;
1969        unsigned blocksize;
1970        struct buffer_head *bh, *head;
1971
1972        bh = head = page_buffers(page);
1973        blocksize = bh->b_size;
1974
1975        block_start = 0;
1976        do {
1977                block_end = block_start + blocksize;
1978                if (block_end <= from || block_start >= to) {
1979                        if (!buffer_uptodate(bh))
1980                                partial = 1;
1981                } else {
1982                        set_buffer_uptodate(bh);
1983                        mark_buffer_dirty(bh);
1984                }
1985                clear_buffer_new(bh);
1986
1987                block_start = block_end;
1988                bh = bh->b_this_page;
1989        } while (bh != head);
1990
1991        /*
1992         * If this is a partial write which happened to make all buffers
1993         * uptodate then we can optimize away a bogus readpage() for
1994         * the next read(). Here we 'discover' whether the page went
1995         * uptodate as a result of this (potentially partial) write.
1996         */
1997        if (!partial)
1998                SetPageUptodate(page);
1999        return 0;
2000}
2001
2002/*
2003 * block_write_begin takes care of the basic task of block allocation and
2004 * bringing partial write blocks uptodate first.
2005 *
2006 * The filesystem needs to handle block truncation upon failure.
2007 */
2008int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2009                unsigned flags, struct page **pagep, get_block_t *get_block)
2010{
2011        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2012        struct page *page;
2013        int status;
2014
2015        page = grab_cache_page_write_begin(mapping, index, flags);
2016        if (!page)
2017                return -ENOMEM;
2018
2019        status = __block_write_begin(page, pos, len, get_block);
2020        if (unlikely(status)) {
2021                unlock_page(page);
2022                page_cache_release(page);
2023                page = NULL;
2024        }
2025
2026        *pagep = page;
2027        return status;
2028}
2029EXPORT_SYMBOL(block_write_begin);
2030
2031int block_write_end(struct file *file, struct address_space *mapping,
2032                        loff_t pos, unsigned len, unsigned copied,
2033                        struct page *page, void *fsdata)
2034{
2035        struct inode *inode = mapping->host;
2036        unsigned start;
2037
2038        start = pos & (PAGE_CACHE_SIZE - 1);
2039
2040        if (unlikely(copied < len)) {
2041                /*
2042                 * The buffers that were written will now be uptodate, so we
2043                 * don't have to worry about a readpage reading them and
2044                 * overwriting a partial write. However if we have encountered
2045                 * a short write and only partially written into a buffer, it
2046                 * will not be marked uptodate, so a readpage might come in and
2047                 * destroy our partial write.
2048                 *
2049                 * Do the simplest thing, and just treat any short write to a
2050                 * non uptodate page as a zero-length write, and force the
2051                 * caller to redo the whole thing.
2052                 */
2053                if (!PageUptodate(page))
2054                        copied = 0;
2055
2056                page_zero_new_buffers(page, start+copied, start+len);
2057        }
2058        flush_dcache_page(page);
2059
2060        /* This could be a short (even 0-length) commit */
2061        __block_commit_write(inode, page, start, start+copied);
2062
2063        return copied;
2064}
2065EXPORT_SYMBOL(block_write_end);
2066
2067int generic_write_end(struct file *file, struct address_space *mapping,
2068                        loff_t pos, unsigned len, unsigned copied,
2069                        struct page *page, void *fsdata)
2070{
2071        struct inode *inode = mapping->host;
2072        loff_t old_size = inode->i_size;
2073        int i_size_changed = 0;
2074
2075        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2076
2077        /*
2078         * No need to use i_size_read() here, the i_size
2079         * cannot change under us because we hold i_mutex.
2080         *
2081         * But it's important to update i_size while still holding page lock:
2082         * page writeout could otherwise come in and zero beyond i_size.
2083         */
2084        if (pos+copied > inode->i_size) {
2085                i_size_write(inode, pos+copied);
2086                i_size_changed = 1;
2087        }
2088
2089        unlock_page(page);
2090        page_cache_release(page);
2091
2092        if (old_size < pos)
2093                pagecache_isize_extended(inode, old_size, pos);
2094        /*
2095         * Don't mark the inode dirty under page lock. First, it unnecessarily
2096         * makes the holding time of page lock longer. Second, it forces lock
2097         * ordering of page lock and transaction start for journaling
2098         * filesystems.
2099         */
2100        if (i_size_changed)
2101                mark_inode_dirty(inode);
2102
2103        return copied;
2104}
2105EXPORT_SYMBOL(generic_write_end);
2106
2107/*
2108 * block_is_partially_uptodate checks whether buffers within a page are
2109 * uptodate or not.
2110 *
2111 * Returns true if all buffers which correspond to a file portion
2112 * we want to read are uptodate.
2113 */
2114int block_is_partially_uptodate(struct page *page, unsigned long from,
2115                                        unsigned long count)
2116{
2117        unsigned block_start, block_end, blocksize;
2118        unsigned to;
2119        struct buffer_head *bh, *head;
2120        int ret = 1;
2121
2122        if (!page_has_buffers(page))
2123                return 0;
2124
2125        head = page_buffers(page);
2126        blocksize = head->b_size;
2127        to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
2128        to = from + to;
2129        if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2130                return 0;
2131
2132        bh = head;
2133        block_start = 0;
2134        do {
2135                block_end = block_start + blocksize;
2136                if (block_end > from && block_start < to) {
2137                        if (!buffer_uptodate(bh)) {
2138                                ret = 0;
2139                                break;
2140                        }
2141                        if (block_end >= to)
2142                                break;
2143                }
2144                block_start = block_end;
2145                bh = bh->b_this_page;
2146        } while (bh != head);
2147
2148        return ret;
2149}
2150EXPORT_SYMBOL(block_is_partially_uptodate);
2151
2152/*
2153 * Generic "read page" function for block devices that have the normal
2154 * get_block functionality. This is most of the block device filesystems.
2155 * Reads the page asynchronously --- the unlock_buffer() and
2156 * set/clear_buffer_uptodate() functions propagate buffer state into the
2157 * page struct once IO has completed.
2158 */
2159int block_read_full_page(struct page *page, get_block_t *get_block)
2160{
2161        struct inode *inode = page->mapping->host;
2162        sector_t iblock, lblock;
2163        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2164        unsigned int blocksize, bbits;
2165        int nr, i;
2166        int fully_mapped = 1;
2167
2168        head = create_page_buffers(page, inode, 0);
2169        blocksize = head->b_size;
2170        bbits = block_size_bits(blocksize);
2171
2172        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2173        lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2174        bh = head;
2175        nr = 0;
2176        i = 0;
2177
2178        do {
2179                if (buffer_uptodate(bh))
2180                        continue;
2181
2182                if (!buffer_mapped(bh)) {
2183                        int err = 0;
2184
2185                        fully_mapped = 0;
2186                        if (iblock < lblock) {
2187                                WARN_ON(bh->b_size != blocksize);
2188                                err = get_block(inode, iblock, bh, 0);
2189                                if (err)
2190                                        SetPageError(page);
2191                        }
2192                        if (!buffer_mapped(bh)) {
2193                                zero_user(page, i * blocksize, blocksize);
2194                                if (!err)
2195                                        set_buffer_uptodate(bh);
2196                                continue;
2197                        }
2198                        /*
2199                         * get_block() might have updated the buffer
2200                         * synchronously
2201                         */
2202                        if (buffer_uptodate(bh))
2203                                continue;
2204                }
2205                arr[nr++] = bh;
2206        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2207
2208        if (fully_mapped)
2209                SetPageMappedToDisk(page);
2210
2211        if (!nr) {
2212                /*
2213                 * All buffers are uptodate - we can set the page uptodate
2214                 * as well. But not if get_block() returned an error.
2215                 */
2216                if (!PageError(page))
2217                        SetPageUptodate(page);
2218                unlock_page(page);
2219                return 0;
2220        }
2221
2222        /* Stage two: lock the buffers */
2223        for (i = 0; i < nr; i++) {
2224                bh = arr[i];
2225                lock_buffer(bh);
2226                mark_buffer_async_read(bh);
2227        }
2228
2229        /*
2230         * Stage 3: start the IO.  Check for uptodateness
2231         * inside the buffer lock in case another process reading
2232         * the underlying blockdev brought it uptodate (the sct fix).
2233         */
2234        for (i = 0; i < nr; i++) {
2235                bh = arr[i];
2236                if (buffer_uptodate(bh))
2237                        end_buffer_async_read(bh, 1);
2238                else
2239                        submit_bh(READ, bh);
2240        }
2241        return 0;
2242}
2243EXPORT_SYMBOL(block_read_full_page);
2244
2245/* utility function for filesystems that need to do work on expanding
2246 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2247 * deal with the hole.  
2248 */
2249int generic_cont_expand_simple(struct inode *inode, loff_t size)
2250{
2251        struct address_space *mapping = inode->i_mapping;
2252        struct page *page;
2253        void *fsdata;
2254        int err;
2255
2256        err = inode_newsize_ok(inode, size);
2257        if (err)
2258                goto out;
2259
2260        err = pagecache_write_begin(NULL, mapping, size, 0,
2261                                AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2262                                &page, &fsdata);
2263        if (err)
2264                goto out;
2265
2266        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2267        BUG_ON(err > 0);
2268
2269out:
2270        return err;
2271}
2272EXPORT_SYMBOL(generic_cont_expand_simple);
2273
2274static int cont_expand_zero(struct file *file, struct address_space *mapping,
2275                            loff_t pos, loff_t *bytes)
2276{
2277        struct inode *inode = mapping->host;
2278        unsigned blocksize = 1 << inode->i_blkbits;
2279        struct page *page;
2280        void *fsdata;
2281        pgoff_t index, curidx;
2282        loff_t curpos;
2283        unsigned zerofrom, offset, len;
2284        int err = 0;
2285
2286        index = pos >> PAGE_CACHE_SHIFT;
2287        offset = pos & ~PAGE_CACHE_MASK;
2288
2289        while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2290                zerofrom = curpos & ~PAGE_CACHE_MASK;
2291                if (zerofrom & (blocksize-1)) {
2292                        *bytes |= (blocksize-1);
2293                        (*bytes)++;
2294                }
2295                len = PAGE_CACHE_SIZE - zerofrom;
2296
2297                err = pagecache_write_begin(file, mapping, curpos, len,
2298                                                AOP_FLAG_UNINTERRUPTIBLE,
2299                                                &page, &fsdata);
2300                if (err)
2301                        goto out;
2302                zero_user(page, zerofrom, len);
2303                err = pagecache_write_end(file, mapping, curpos, len, len,
2304                                                page, fsdata);
2305                if (err < 0)
2306                        goto out;
2307                BUG_ON(err != len);
2308                err = 0;
2309
2310                balance_dirty_pages_ratelimited(mapping);
2311
2312                if (unlikely(fatal_signal_pending(current))) {
2313                        err = -EINTR;
2314                        goto out;
2315                }
2316        }
2317
2318        /* page covers the boundary, find the boundary offset */
2319        if (index == curidx) {
2320                zerofrom = curpos & ~PAGE_CACHE_MASK;
2321                /* if we will expand the thing last block will be filled */
2322                if (offset <= zerofrom) {
2323                        goto out;
2324                }
2325                if (zerofrom & (blocksize-1)) {
2326                        *bytes |= (blocksize-1);
2327                        (*bytes)++;
2328                }
2329                len = offset - zerofrom;
2330
2331                err = pagecache_write_begin(file, mapping, curpos, len,
2332                                                AOP_FLAG_UNINTERRUPTIBLE,
2333                                                &page, &fsdata);
2334                if (err)
2335                        goto out;
2336                zero_user(page, zerofrom, len);
2337                err = pagecache_write_end(file, mapping, curpos, len, len,
2338                                                page, fsdata);
2339                if (err < 0)
2340                        goto out;
2341                BUG_ON(err != len);
2342                err = 0;
2343        }
2344out:
2345        return err;
2346}
2347
2348/*
2349 * For moronic filesystems that do not allow holes in file.
2350 * We may have to extend the file.
2351 */
2352int cont_write_begin(struct file *file, struct address_space *mapping,
2353                        loff_t pos, unsigned len, unsigned flags,
2354                        struct page **pagep, void **fsdata,
2355                        get_block_t *get_block, loff_t *bytes)
2356{
2357        struct inode *inode = mapping->host;
2358        unsigned blocksize = 1 << inode->i_blkbits;
2359        unsigned zerofrom;
2360        int err;
2361
2362        err = cont_expand_zero(file, mapping, pos, bytes);
2363        if (err)
2364                return err;
2365
2366        zerofrom = *bytes & ~PAGE_CACHE_MASK;
2367        if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2368                *bytes |= (blocksize-1);
2369                (*bytes)++;
2370        }
2371
2372        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2373}
2374EXPORT_SYMBOL(cont_write_begin);
2375
2376int block_commit_write(struct page *page, unsigned from, unsigned to)
2377{
2378        struct inode *inode = page->mapping->host;
2379        __block_commit_write(inode,page,from,to);
2380        return 0;
2381}
2382EXPORT_SYMBOL(block_commit_write);
2383
2384/*
2385 * block_page_mkwrite() is not allowed to change the file size as it gets
2386 * called from a page fault handler when a page is first dirtied. Hence we must
2387 * be careful to check for EOF conditions here. We set the page up correctly
2388 * for a written page which means we get ENOSPC checking when writing into
2389 * holes and correct delalloc and unwritten extent mapping on filesystems that
2390 * support these features.
2391 *
2392 * We are not allowed to take the i_mutex here so we have to play games to
2393 * protect against truncate races as the page could now be beyond EOF.  Because
2394 * truncate writes the inode size before removing pages, once we have the
2395 * page lock we can determine safely if the page is beyond EOF. If it is not
2396 * beyond EOF, then the page is guaranteed safe against truncation until we
2397 * unlock the page.
2398 *
2399 * Direct callers of this function should protect against filesystem freezing
2400 * using sb_start_write() - sb_end_write() functions.
2401 */
2402int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2403                         get_block_t get_block)
2404{
2405        struct page *page = vmf->page;
2406        struct inode *inode = file_inode(vma->vm_file);
2407        unsigned long end;
2408        loff_t size;
2409        int ret;
2410
2411        lock_page(page);
2412        size = i_size_read(inode);
2413        if ((page->mapping != inode->i_mapping) ||
2414            (page_offset(page) > size)) {
2415                /* We overload EFAULT to mean page got truncated */
2416                ret = -EFAULT;
2417                goto out_unlock;
2418        }
2419
2420        /* page is wholly or partially inside EOF */
2421        if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422                end = size & ~PAGE_CACHE_MASK;
2423        else
2424                end = PAGE_CACHE_SIZE;
2425
2426        ret = __block_write_begin(page, 0, end, get_block);
2427        if (!ret)
2428                ret = block_commit_write(page, 0, end);
2429
2430        if (unlikely(ret < 0))
2431                goto out_unlock;
2432        set_page_dirty(page);
2433        wait_for_stable_page(page);
2434        return 0;
2435out_unlock:
2436        unlock_page(page);
2437        return ret;
2438}
2439EXPORT_SYMBOL(__block_page_mkwrite);
2440
2441int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2442                   get_block_t get_block)
2443{
2444        int ret;
2445        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
2446
2447        sb_start_pagefault(sb);
2448
2449        /*
2450         * Update file times before taking page lock. We may end up failing the
2451         * fault so this update may be superfluous but who really cares...
2452         */
2453        file_update_time(vma->vm_file);
2454
2455        ret = __block_page_mkwrite(vma, vmf, get_block);
2456        sb_end_pagefault(sb);
2457        return block_page_mkwrite_return(ret);
2458}
2459EXPORT_SYMBOL(block_page_mkwrite);
2460
2461/*
2462 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2463 * immediately, while under the page lock.  So it needs a special end_io
2464 * handler which does not touch the bh after unlocking it.
2465 */
2466static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2467{
2468        __end_buffer_read_notouch(bh, uptodate);
2469}
2470
2471/*
2472 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2473 * the page (converting it to circular linked list and taking care of page
2474 * dirty races).
2475 */
2476static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2477{
2478        struct buffer_head *bh;
2479
2480        BUG_ON(!PageLocked(page));
2481
2482        spin_lock(&page->mapping->private_lock);
2483        bh = head;
2484        do {
2485                if (PageDirty(page))
2486                        set_buffer_dirty(bh);
2487                if (!bh->b_this_page)
2488                        bh->b_this_page = head;
2489                bh = bh->b_this_page;
2490        } while (bh != head);
2491        attach_page_buffers(page, head);
2492        spin_unlock(&page->mapping->private_lock);
2493}
2494
2495/*
2496 * On entry, the page is fully not uptodate.
2497 * On exit the page is fully uptodate in the areas outside (from,to)
2498 * The filesystem needs to handle block truncation upon failure.
2499 */
2500int nobh_write_begin(struct address_space *mapping,
2501                        loff_t pos, unsigned len, unsigned flags,
2502                        struct page **pagep, void **fsdata,
2503                        get_block_t *get_block)
2504{
2505        struct inode *inode = mapping->host;
2506        const unsigned blkbits = inode->i_blkbits;
2507        const unsigned blocksize = 1 << blkbits;
2508        struct buffer_head *head, *bh;
2509        struct page *page;
2510        pgoff_t index;
2511        unsigned from, to;
2512        unsigned block_in_page;
2513        unsigned block_start, block_end;
2514        sector_t block_in_file;
2515        int nr_reads = 0;
2516        int ret = 0;
2517        int is_mapped_to_disk = 1;
2518
2519        index = pos >> PAGE_CACHE_SHIFT;
2520        from = pos & (PAGE_CACHE_SIZE - 1);
2521        to = from + len;
2522
2523        page = grab_cache_page_write_begin(mapping, index, flags);
2524        if (!page)
2525                return -ENOMEM;
2526        *pagep = page;
2527        *fsdata = NULL;
2528
2529        if (page_has_buffers(page)) {
2530                ret = __block_write_begin(page, pos, len, get_block);
2531                if (unlikely(ret))
2532                        goto out_release;
2533                return ret;
2534        }
2535
2536        if (PageMappedToDisk(page))
2537                return 0;
2538
2539        /*
2540         * Allocate buffers so that we can keep track of state, and potentially
2541         * attach them to the page if an error occurs. In the common case of
2542         * no error, they will just be freed again without ever being attached
2543         * to the page (which is all OK, because we're under the page lock).
2544         *
2545         * Be careful: the buffer linked list is a NULL terminated one, rather
2546         * than the circular one we're used to.
2547         */
2548        head = alloc_page_buffers(page, blocksize, 0);
2549        if (!head) {
2550                ret = -ENOMEM;
2551                goto out_release;
2552        }
2553
2554        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2555
2556        /*
2557         * We loop across all blocks in the page, whether or not they are
2558         * part of the affected region.  This is so we can discover if the
2559         * page is fully mapped-to-disk.
2560         */
2561        for (block_start = 0, block_in_page = 0, bh = head;
2562                  block_start < PAGE_CACHE_SIZE;
2563                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2564                int create;
2565
2566                block_end = block_start + blocksize;
2567                bh->b_state = 0;
2568                create = 1;
2569                if (block_start >= to)
2570                        create = 0;
2571                ret = get_block(inode, block_in_file + block_in_page,
2572                                        bh, create);
2573                if (ret)
2574                        goto failed;
2575                if (!buffer_mapped(bh))
2576                        is_mapped_to_disk = 0;
2577                if (buffer_new(bh))
2578                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2579                if (PageUptodate(page)) {
2580                        set_buffer_uptodate(bh);
2581                        continue;
2582                }
2583                if (buffer_new(bh) || !buffer_mapped(bh)) {
2584                        zero_user_segments(page, block_start, from,
2585                                                        to, block_end);
2586                        continue;
2587                }
2588                if (buffer_uptodate(bh))
2589                        continue;       /* reiserfs does this */
2590                if (block_start < from || block_end > to) {
2591                        lock_buffer(bh);
2592                        bh->b_end_io = end_buffer_read_nobh;
2593                        submit_bh(READ, bh);
2594                        nr_reads++;
2595                }
2596        }
2597
2598        if (nr_reads) {
2599                /*
2600                 * The page is locked, so these buffers are protected from
2601                 * any VM or truncate activity.  Hence we don't need to care
2602                 * for the buffer_head refcounts.
2603                 */
2604                for (bh = head; bh; bh = bh->b_this_page) {
2605                        wait_on_buffer(bh);
2606                        if (!buffer_uptodate(bh))
2607                                ret = -EIO;
2608                }
2609                if (ret)
2610                        goto failed;
2611        }
2612
2613        if (is_mapped_to_disk)
2614                SetPageMappedToDisk(page);
2615
2616        *fsdata = head; /* to be released by nobh_write_end */
2617
2618        return 0;
2619
2620failed:
2621        BUG_ON(!ret);
2622        /*
2623         * Error recovery is a bit difficult. We need to zero out blocks that
2624         * were newly allocated, and dirty them to ensure they get written out.
2625         * Buffers need to be attached to the page at this point, otherwise
2626         * the handling of potential IO errors during writeout would be hard
2627         * (could try doing synchronous writeout, but what if that fails too?)
2628         */
2629        attach_nobh_buffers(page, head);
2630        page_zero_new_buffers(page, from, to);
2631
2632out_release:
2633        unlock_page(page);
2634        page_cache_release(page);
2635        *pagep = NULL;
2636
2637        return ret;
2638}
2639EXPORT_SYMBOL(nobh_write_begin);
2640
2641int nobh_write_end(struct file *file, struct address_space *mapping,
2642                        loff_t pos, unsigned len, unsigned copied,
2643                        struct page *page, void *fsdata)
2644{
2645        struct inode *inode = page->mapping->host;
2646        struct buffer_head *head = fsdata;
2647        struct buffer_head *bh;
2648        BUG_ON(fsdata != NULL && page_has_buffers(page));
2649
2650        if (unlikely(copied < len) && head)
2651                attach_nobh_buffers(page, head);
2652        if (page_has_buffers(page))
2653                return generic_write_end(file, mapping, pos, len,
2654                                        copied, page, fsdata);
2655
2656        SetPageUptodate(page);
2657        set_page_dirty(page);
2658        if (pos+copied > inode->i_size) {
2659                i_size_write(inode, pos+copied);
2660                mark_inode_dirty(inode);
2661        }
2662
2663        unlock_page(page);
2664        page_cache_release(page);
2665
2666        while (head) {
2667                bh = head;
2668                head = head->b_this_page;
2669                free_buffer_head(bh);
2670        }
2671
2672        return copied;
2673}
2674EXPORT_SYMBOL(nobh_write_end);
2675
2676/*
2677 * nobh_writepage() - based on block_full_write_page() except
2678 * that it tries to operate without attaching bufferheads to
2679 * the page.
2680 */
2681int nobh_writepage(struct page *page, get_block_t *get_block,
2682                        struct writeback_control *wbc)
2683{
2684        struct inode * const inode = page->mapping->host;
2685        loff_t i_size = i_size_read(inode);
2686        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2687        unsigned offset;
2688        int ret;
2689
2690        /* Is the page fully inside i_size? */
2691        if (page->index < end_index)
2692                goto out;
2693
2694        /* Is the page fully outside i_size? (truncate in progress) */
2695        offset = i_size & (PAGE_CACHE_SIZE-1);
2696        if (page->index >= end_index+1 || !offset) {
2697                /*
2698                 * The page may have dirty, unmapped buffers.  For example,
2699                 * they may have been added in ext3_writepage().  Make them
2700                 * freeable here, so the page does not leak.
2701                 */
2702#if 0
2703                /* Not really sure about this  - do we need this ? */
2704                if (page->mapping->a_ops->invalidatepage)
2705                        page->mapping->a_ops->invalidatepage(page, offset);
2706#endif
2707                unlock_page(page);
2708                return 0; /* don't care */
2709        }
2710
2711        /*
2712         * The page straddles i_size.  It must be zeroed out on each and every
2713         * writepage invocation because it may be mmapped.  "A file is mapped
2714         * in multiples of the page size.  For a file that is not a multiple of
2715         * the  page size, the remaining memory is zeroed when mapped, and
2716         * writes to that region are not written out to the file."
2717         */
2718        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2719out:
2720        ret = mpage_writepage(page, get_block, wbc);
2721        if (ret == -EAGAIN)
2722                ret = __block_write_full_page(inode, page, get_block, wbc,
2723                                              end_buffer_async_write);
2724        return ret;
2725}
2726EXPORT_SYMBOL(nobh_writepage);
2727
2728int nobh_truncate_page(struct address_space *mapping,
2729                        loff_t from, get_block_t *get_block)
2730{
2731        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2732        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2733        unsigned blocksize;
2734        sector_t iblock;
2735        unsigned length, pos;
2736        struct inode *inode = mapping->host;
2737        struct page *page;
2738        struct buffer_head map_bh;
2739        int err;
2740
2741        blocksize = 1 << inode->i_blkbits;
2742        length = offset & (blocksize - 1);
2743
2744        /* Block boundary? Nothing to do */
2745        if (!length)
2746                return 0;
2747
2748        length = blocksize - length;
2749        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2750
2751        page = grab_cache_page(mapping, index);
2752        err = -ENOMEM;
2753        if (!page)
2754                goto out;
2755
2756        if (page_has_buffers(page)) {
2757has_buffers:
2758                unlock_page(page);
2759                page_cache_release(page);
2760                return block_truncate_page(mapping, from, get_block);
2761        }
2762
2763        /* Find the buffer that contains "offset" */
2764        pos = blocksize;
2765        while (offset >= pos) {
2766                iblock++;
2767                pos += blocksize;
2768        }
2769
2770        map_bh.b_size = blocksize;
2771        map_bh.b_state = 0;
2772        err = get_block(inode, iblock, &map_bh, 0);
2773        if (err)
2774                goto unlock;
2775        /* unmapped? It's a hole - nothing to do */
2776        if (!buffer_mapped(&map_bh))
2777                goto unlock;
2778
2779        /* Ok, it's mapped. Make sure it's up-to-date */
2780        if (!PageUptodate(page)) {
2781                err = mapping->a_ops->readpage(NULL, page);
2782                if (err) {
2783                        page_cache_release(page);
2784                        goto out;
2785                }
2786                lock_page(page);
2787                if (!PageUptodate(page)) {
2788                        err = -EIO;
2789                        goto unlock;
2790                }
2791                if (page_has_buffers(page))
2792                        goto has_buffers;
2793        }
2794        zero_user(page, offset, length);
2795        set_page_dirty(page);
2796        err = 0;
2797
2798unlock:
2799        unlock_page(page);
2800        page_cache_release(page);
2801out:
2802        return err;
2803}
2804EXPORT_SYMBOL(nobh_truncate_page);
2805
2806int block_truncate_page(struct address_space *mapping,
2807                        loff_t from, get_block_t *get_block)
2808{
2809        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2810        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2811        unsigned blocksize;
2812        sector_t iblock;
2813        unsigned length, pos;
2814        struct inode *inode = mapping->host;
2815        struct page *page;
2816        struct buffer_head *bh;
2817        int err;
2818
2819        blocksize = 1 << inode->i_blkbits;
2820        length = offset & (blocksize - 1);
2821
2822        /* Block boundary? Nothing to do */
2823        if (!length)
2824                return 0;
2825
2826        length = blocksize - length;
2827        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2828        
2829        page = grab_cache_page(mapping, index);
2830        err = -ENOMEM;
2831        if (!page)
2832                goto out;
2833
2834        if (!page_has_buffers(page))
2835                create_empty_buffers(page, blocksize, 0);
2836
2837        /* Find the buffer that contains "offset" */
2838        bh = page_buffers(page);
2839        pos = blocksize;
2840        while (offset >= pos) {
2841                bh = bh->b_this_page;
2842                iblock++;
2843                pos += blocksize;
2844        }
2845
2846        err = 0;
2847        if (!buffer_mapped(bh)) {
2848                WARN_ON(bh->b_size != blocksize);
2849                err = get_block(inode, iblock, bh, 0);
2850                if (err)
2851                        goto unlock;
2852                /* unmapped? It's a hole - nothing to do */
2853                if (!buffer_mapped(bh))
2854                        goto unlock;
2855        }
2856
2857        /* Ok, it's mapped. Make sure it's up-to-date */
2858        if (PageUptodate(page))
2859                set_buffer_uptodate(bh);
2860
2861        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2862                err = -EIO;
2863                ll_rw_block(READ, 1, &bh);
2864                wait_on_buffer(bh);
2865                /* Uhhuh. Read error. Complain and punt. */
2866                if (!buffer_uptodate(bh))
2867                        goto unlock;
2868        }
2869
2870        zero_user(page, offset, length);
2871        mark_buffer_dirty(bh);
2872        err = 0;
2873
2874unlock:
2875        unlock_page(page);
2876        page_cache_release(page);
2877out:
2878        return err;
2879}
2880EXPORT_SYMBOL(block_truncate_page);
2881
2882/*
2883 * The generic ->writepage function for buffer-backed address_spaces
2884 */
2885int block_write_full_page(struct page *page, get_block_t *get_block,
2886                        struct writeback_control *wbc)
2887{
2888        struct inode * const inode = page->mapping->host;
2889        loff_t i_size = i_size_read(inode);
2890        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2891        unsigned offset;
2892
2893        /* Is the page fully inside i_size? */
2894        if (page->index < end_index)
2895                return __block_write_full_page(inode, page, get_block, wbc,
2896                                               end_buffer_async_write);
2897
2898        /* Is the page fully outside i_size? (truncate in progress) */
2899        offset = i_size & (PAGE_CACHE_SIZE-1);
2900        if (page->index >= end_index+1 || !offset) {
2901                /*
2902                 * The page may have dirty, unmapped buffers.  For example,
2903                 * they may have been added in ext3_writepage().  Make them
2904                 * freeable here, so the page does not leak.
2905                 */
2906                do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2907                unlock_page(page);
2908                return 0; /* don't care */
2909        }
2910
2911        /*
2912         * The page straddles i_size.  It must be zeroed out on each and every
2913         * writepage invocation because it may be mmapped.  "A file is mapped
2914         * in multiples of the page size.  For a file that is not a multiple of
2915         * the  page size, the remaining memory is zeroed when mapped, and
2916         * writes to that region are not written out to the file."
2917         */
2918        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2919        return __block_write_full_page(inode, page, get_block, wbc,
2920                                                        end_buffer_async_write);
2921}
2922EXPORT_SYMBOL(block_write_full_page);
2923
2924sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2925                            get_block_t *get_block)
2926{
2927        struct buffer_head tmp;
2928        struct inode *inode = mapping->host;
2929        tmp.b_state = 0;
2930        tmp.b_blocknr = 0;
2931        tmp.b_size = 1 << inode->i_blkbits;
2932        get_block(inode, block, &tmp, 0);
2933        return tmp.b_blocknr;
2934}
2935EXPORT_SYMBOL(generic_block_bmap);
2936
2937static void end_bio_bh_io_sync(struct bio *bio, int err)
2938{
2939        struct buffer_head *bh = bio->bi_private;
2940
2941        if (err == -EOPNOTSUPP) {
2942                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2943        }
2944
2945        if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2946                set_bit(BH_Quiet, &bh->b_state);
2947
2948        bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2949        bio_put(bio);
2950}
2951
2952/*
2953 * This allows us to do IO even on the odd last sectors
2954 * of a device, even if the block size is some multiple
2955 * of the physical sector size.
2956 *
2957 * We'll just truncate the bio to the size of the device,
2958 * and clear the end of the buffer head manually.
2959 *
2960 * Truly out-of-range accesses will turn into actual IO
2961 * errors, this only handles the "we need to be able to
2962 * do IO at the final sector" case.
2963 */
2964void guard_bio_eod(int rw, struct bio *bio)
2965{
2966        sector_t maxsector;
2967        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2968        unsigned truncated_bytes;
2969
2970        maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2971        if (!maxsector)
2972                return;
2973
2974        /*
2975         * If the *whole* IO is past the end of the device,
2976         * let it through, and the IO layer will turn it into
2977         * an EIO.
2978         */
2979        if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2980                return;
2981
2982        maxsector -= bio->bi_iter.bi_sector;
2983        if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2984                return;
2985
2986        /* Uhhuh. We've got a bio that straddles the device size! */
2987        truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2988
2989        /* Truncate the bio.. */
2990        bio->bi_iter.bi_size -= truncated_bytes;
2991        bvec->bv_len -= truncated_bytes;
2992
2993        /* ..and clear the end of the buffer for reads */
2994        if ((rw & RW_MASK) == READ) {
2995                zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2996                                truncated_bytes);
2997        }
2998}
2999
3000int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3001{
3002        struct bio *bio;
3003        int ret = 0;
3004
3005        BUG_ON(!buffer_locked(bh));
3006        BUG_ON(!buffer_mapped(bh));
3007        BUG_ON(!bh->b_end_io);
3008        BUG_ON(buffer_delay(bh));
3009        BUG_ON(buffer_unwritten(bh));
3010
3011        /*
3012         * Only clear out a write error when rewriting
3013         */
3014        if (test_set_buffer_req(bh) && (rw & WRITE))
3015                clear_buffer_write_io_error(bh);
3016
3017        /*
3018         * from here on down, it's all bio -- do the initial mapping,
3019         * submit_bio -> generic_make_request may further map this bio around
3020         */
3021        bio = bio_alloc(GFP_NOIO, 1);
3022
3023        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3024        bio->bi_bdev = bh->b_bdev;
3025        bio->bi_io_vec[0].bv_page = bh->b_page;
3026        bio->bi_io_vec[0].bv_len = bh->b_size;
3027        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3028
3029        bio->bi_vcnt = 1;
3030        bio->bi_iter.bi_size = bh->b_size;
3031
3032        bio->bi_end_io = end_bio_bh_io_sync;
3033        bio->bi_private = bh;
3034        bio->bi_flags |= bio_flags;
3035
3036        /* Take care of bh's that straddle the end of the device */
3037        guard_bio_eod(rw, bio);
3038
3039        if (buffer_meta(bh))
3040                rw |= REQ_META;
3041        if (buffer_prio(bh))
3042                rw |= REQ_PRIO;
3043
3044        bio_get(bio);
3045        submit_bio(rw, bio);
3046
3047        if (bio_flagged(bio, BIO_EOPNOTSUPP))
3048                ret = -EOPNOTSUPP;
3049
3050        bio_put(bio);
3051        return ret;
3052}
3053EXPORT_SYMBOL_GPL(_submit_bh);
3054
3055int submit_bh(int rw, struct buffer_head *bh)
3056{
3057        return _submit_bh(rw, bh, 0);
3058}
3059EXPORT_SYMBOL(submit_bh);
3060
3061/**
3062 * ll_rw_block: low-level access to block devices (DEPRECATED)
3063 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3064 * @nr: number of &struct buffer_heads in the array
3065 * @bhs: array of pointers to &struct buffer_head
3066 *
3067 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3068 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3069 * %READA option is described in the documentation for generic_make_request()
3070 * which ll_rw_block() calls.
3071 *
3072 * This function drops any buffer that it cannot get a lock on (with the
3073 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3074 * request, and any buffer that appears to be up-to-date when doing read
3075 * request.  Further it marks as clean buffers that are processed for
3076 * writing (the buffer cache won't assume that they are actually clean
3077 * until the buffer gets unlocked).
3078 *
3079 * ll_rw_block sets b_end_io to simple completion handler that marks
3080 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3081 * any waiters. 
3082 *
3083 * All of the buffers must be for the same device, and must also be a
3084 * multiple of the current approved size for the device.
3085 */
3086void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3087{
3088        int i;
3089
3090        for (i = 0; i < nr; i++) {
3091                struct buffer_head *bh = bhs[i];
3092
3093                if (!trylock_buffer(bh))
3094                        continue;
3095                if (rw == WRITE) {
3096                        if (test_clear_buffer_dirty(bh)) {
3097                                bh->b_end_io = end_buffer_write_sync;
3098                                get_bh(bh);
3099                                submit_bh(WRITE, bh);
3100                                continue;
3101                        }
3102                } else {
3103                        if (!buffer_uptodate(bh)) {
3104                                bh->b_end_io = end_buffer_read_sync;
3105                                get_bh(bh);
3106                                submit_bh(rw, bh);
3107                                continue;
3108                        }
3109                }
3110                unlock_buffer(bh);
3111        }
3112}
3113EXPORT_SYMBOL(ll_rw_block);
3114
3115void write_dirty_buffer(struct buffer_head *bh, int rw)
3116{
3117        lock_buffer(bh);
3118        if (!test_clear_buffer_dirty(bh)) {
3119                unlock_buffer(bh);
3120                return;
3121        }
3122        bh->b_end_io = end_buffer_write_sync;
3123        get_bh(bh);
3124        submit_bh(rw, bh);
3125}
3126EXPORT_SYMBOL(write_dirty_buffer);
3127
3128/*
3129 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3130 * and then start new I/O and then wait upon it.  The caller must have a ref on
3131 * the buffer_head.
3132 */
3133int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3134{
3135        int ret = 0;
3136
3137        WARN_ON(atomic_read(&bh->b_count) < 1);
3138        lock_buffer(bh);
3139        if (test_clear_buffer_dirty(bh)) {
3140                get_bh(bh);
3141                bh->b_end_io = end_buffer_write_sync;
3142                ret = submit_bh(rw, bh);
3143                wait_on_buffer(bh);
3144                if (!ret && !buffer_uptodate(bh))
3145                        ret = -EIO;
3146        } else {
3147                unlock_buffer(bh);
3148        }
3149        return ret;
3150}
3151EXPORT_SYMBOL(__sync_dirty_buffer);
3152
3153int sync_dirty_buffer(struct buffer_head *bh)
3154{
3155        return __sync_dirty_buffer(bh, WRITE_SYNC);
3156}
3157EXPORT_SYMBOL(sync_dirty_buffer);
3158
3159/*
3160 * try_to_free_buffers() checks if all the buffers on this particular page
3161 * are unused, and releases them if so.
3162 *
3163 * Exclusion against try_to_free_buffers may be obtained by either
3164 * locking the page or by holding its mapping's private_lock.
3165 *
3166 * If the page is dirty but all the buffers are clean then we need to
3167 * be sure to mark the page clean as well.  This is because the page
3168 * may be against a block device, and a later reattachment of buffers
3169 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3170 * filesystem data on the same device.
3171 *
3172 * The same applies to regular filesystem pages: if all the buffers are
3173 * clean then we set the page clean and proceed.  To do that, we require
3174 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3175 * private_lock.
3176 *
3177 * try_to_free_buffers() is non-blocking.
3178 */
3179static inline int buffer_busy(struct buffer_head *bh)
3180{
3181        return atomic_read(&bh->b_count) |
3182                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3183}
3184
3185static int
3186drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3187{
3188        struct buffer_head *head = page_buffers(page);
3189        struct buffer_head *bh;
3190
3191        bh = head;
3192        do {
3193                if (buffer_write_io_error(bh) && page->mapping)
3194                        set_bit(AS_EIO, &page->mapping->flags);
3195                if (buffer_busy(bh))
3196                        goto failed;
3197                bh = bh->b_this_page;
3198        } while (bh != head);
3199
3200        do {
3201                struct buffer_head *next = bh->b_this_page;
3202
3203                if (bh->b_assoc_map)
3204                        __remove_assoc_queue(bh);
3205                bh = next;
3206        } while (bh != head);
3207        *buffers_to_free = head;
3208        __clear_page_buffers(page);
3209        return 1;
3210failed:
3211        return 0;
3212}
3213
3214int try_to_free_buffers(struct page *page)
3215{
3216        struct address_space * const mapping = page->mapping;
3217        struct buffer_head *buffers_to_free = NULL;
3218        int ret = 0;
3219
3220        BUG_ON(!PageLocked(page));
3221        if (PageWriteback(page))
3222                return 0;
3223
3224        if (mapping == NULL) {          /* can this still happen? */
3225                ret = drop_buffers(page, &buffers_to_free);
3226                goto out;
3227        }
3228
3229        spin_lock(&mapping->private_lock);
3230        ret = drop_buffers(page, &buffers_to_free);
3231
3232        /*
3233         * If the filesystem writes its buffers by hand (eg ext3)
3234         * then we can have clean buffers against a dirty page.  We
3235         * clean the page here; otherwise the VM will never notice
3236         * that the filesystem did any IO at all.
3237         *
3238         * Also, during truncate, discard_buffer will have marked all
3239         * the page's buffers clean.  We discover that here and clean
3240         * the page also.
3241         *
3242         * private_lock must be held over this entire operation in order
3243         * to synchronise against __set_page_dirty_buffers and prevent the
3244         * dirty bit from being lost.
3245         */
3246        if (ret)
3247                cancel_dirty_page(page, PAGE_CACHE_SIZE);
3248        spin_unlock(&mapping->private_lock);
3249out:
3250        if (buffers_to_free) {
3251                struct buffer_head *bh = buffers_to_free;
3252
3253                do {
3254                        struct buffer_head *next = bh->b_this_page;
3255                        free_buffer_head(bh);
3256                        bh = next;
3257                } while (bh != buffers_to_free);
3258        }
3259        return ret;
3260}
3261EXPORT_SYMBOL(try_to_free_buffers);
3262
3263/*
3264 * There are no bdflush tunables left.  But distributions are
3265 * still running obsolete flush daemons, so we terminate them here.
3266 *
3267 * Use of bdflush() is deprecated and will be removed in a future kernel.
3268 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3269 */
3270SYSCALL_DEFINE2(bdflush, int, func, long, data)
3271{
3272        static int msg_count;
3273
3274        if (!capable(CAP_SYS_ADMIN))
3275                return -EPERM;
3276
3277        if (msg_count < 5) {
3278                msg_count++;
3279                printk(KERN_INFO
3280                        "warning: process `%s' used the obsolete bdflush"
3281                        " system call\n", current->comm);
3282                printk(KERN_INFO "Fix your initscripts?\n");
3283        }
3284
3285        if (func == 1)
3286                do_exit(0);
3287        return 0;
3288}
3289
3290/*
3291 * Buffer-head allocation
3292 */
3293static struct kmem_cache *bh_cachep __read_mostly;
3294
3295/*
3296 * Once the number of bh's in the machine exceeds this level, we start
3297 * stripping them in writeback.
3298 */
3299static unsigned long max_buffer_heads;
3300
3301int buffer_heads_over_limit;
3302
3303struct bh_accounting {
3304        int nr;                 /* Number of live bh's */
3305        int ratelimit;          /* Limit cacheline bouncing */
3306};
3307
3308static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3309
3310static void recalc_bh_state(void)
3311{
3312        int i;
3313        int tot = 0;
3314
3315        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3316                return;
3317        __this_cpu_write(bh_accounting.ratelimit, 0);
3318        for_each_online_cpu(i)
3319                tot += per_cpu(bh_accounting, i).nr;
3320        buffer_heads_over_limit = (tot > max_buffer_heads);
3321}
3322
3323struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3324{
3325        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3326        if (ret) {
3327                INIT_LIST_HEAD(&ret->b_assoc_buffers);
3328                preempt_disable();
3329                __this_cpu_inc(bh_accounting.nr);
3330                recalc_bh_state();
3331                preempt_enable();
3332        }
3333        return ret;
3334}
3335EXPORT_SYMBOL(alloc_buffer_head);
3336
3337void free_buffer_head(struct buffer_head *bh)
3338{
3339        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3340        kmem_cache_free(bh_cachep, bh);
3341        preempt_disable();
3342        __this_cpu_dec(bh_accounting.nr);
3343        recalc_bh_state();
3344        preempt_enable();
3345}
3346EXPORT_SYMBOL(free_buffer_head);
3347
3348static void buffer_exit_cpu(int cpu)
3349{
3350        int i;
3351        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3352
3353        for (i = 0; i < BH_LRU_SIZE; i++) {
3354                brelse(b->bhs[i]);
3355                b->bhs[i] = NULL;
3356        }
3357        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3358        per_cpu(bh_accounting, cpu).nr = 0;
3359}
3360
3361static int buffer_cpu_notify(struct notifier_block *self,
3362                              unsigned long action, void *hcpu)
3363{
3364        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3365                buffer_exit_cpu((unsigned long)hcpu);
3366        return NOTIFY_OK;
3367}
3368
3369/**
3370 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3371 * @bh: struct buffer_head
3372 *
3373 * Return true if the buffer is up-to-date and false,
3374 * with the buffer locked, if not.
3375 */
3376int bh_uptodate_or_lock(struct buffer_head *bh)
3377{
3378        if (!buffer_uptodate(bh)) {
3379                lock_buffer(bh);
3380                if (!buffer_uptodate(bh))
3381                        return 0;
3382                unlock_buffer(bh);
3383        }
3384        return 1;
3385}
3386EXPORT_SYMBOL(bh_uptodate_or_lock);
3387
3388/**
3389 * bh_submit_read - Submit a locked buffer for reading
3390 * @bh: struct buffer_head
3391 *
3392 * Returns zero on success and -EIO on error.
3393 */
3394int bh_submit_read(struct buffer_head *bh)
3395{
3396        BUG_ON(!buffer_locked(bh));
3397
3398        if (buffer_uptodate(bh)) {
3399                unlock_buffer(bh);
3400                return 0;
3401        }
3402
3403        get_bh(bh);
3404        bh->b_end_io = end_buffer_read_sync;
3405        submit_bh(READ, bh);
3406        wait_on_buffer(bh);
3407        if (buffer_uptodate(bh))
3408                return 0;
3409        return -EIO;
3410}
3411EXPORT_SYMBOL(bh_submit_read);
3412
3413void __init buffer_init(void)
3414{
3415        unsigned long nrpages;
3416
3417        bh_cachep = kmem_cache_create("buffer_head",
3418                        sizeof(struct buffer_head), 0,
3419                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3420                                SLAB_MEM_SPREAD),
3421                                NULL);
3422
3423        /*
3424         * Limit the bh occupancy to 10% of ZONE_NORMAL
3425         */
3426        nrpages = (nr_free_buffer_pages() * 10) / 100;
3427        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3428        hotcpu_notifier(buffer_cpu_notify, 0);
3429}
3430