linux/fs/buffer.c
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
   16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/export.h>
  33#include <linux/backing-dev.h>
  34#include <linux/writeback.h>
  35#include <linux/hash.h>
  36#include <linux/suspend.h>
  37#include <linux/buffer_head.h>
  38#include <linux/task_io_accounting_ops.h>
  39#include <linux/bio.h>
  40#include <linux/notifier.h>
  41#include <linux/cpu.h>
  42#include <linux/bitops.h>
  43#include <linux/mpage.h>
  44#include <linux/bit_spinlock.h>
  45#include <trace/events/block.h>
  46
  47static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  48static int submit_bh_wbc(int rw, struct buffer_head *bh,
  49                         unsigned long bio_flags,
  50                         struct writeback_control *wbc);
  51
  52#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  53
  54void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  55{
  56        bh->b_end_io = handler;
  57        bh->b_private = private;
  58}
  59EXPORT_SYMBOL(init_buffer);
  60
  61inline void touch_buffer(struct buffer_head *bh)
  62{
  63        trace_block_touch_buffer(bh);
  64        mark_page_accessed(bh->b_page);
  65}
  66EXPORT_SYMBOL(touch_buffer);
  67
  68void __lock_buffer(struct buffer_head *bh)
  69{
  70        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  71}
  72EXPORT_SYMBOL(__lock_buffer);
  73
  74void unlock_buffer(struct buffer_head *bh)
  75{
  76        clear_bit_unlock(BH_Lock, &bh->b_state);
  77        smp_mb__after_atomic();
  78        wake_up_bit(&bh->b_state, BH_Lock);
  79}
  80EXPORT_SYMBOL(unlock_buffer);
  81
  82/*
   83 * Returns whether the page has dirty or writeback buffers. If all the
   84 * buffers are unlocked and clean then the PageDirty information is stale.
   85 * If any of the buffers are locked, it is assumed they are locked for IO.
  86 */
  87void buffer_check_dirty_writeback(struct page *page,
  88                                     bool *dirty, bool *writeback)
  89{
  90        struct buffer_head *head, *bh;
  91        *dirty = false;
  92        *writeback = false;
  93
  94        BUG_ON(!PageLocked(page));
  95
  96        if (!page_has_buffers(page))
  97                return;
  98
  99        if (PageWriteback(page))
 100                *writeback = true;
 101
 102        head = page_buffers(page);
 103        bh = head;
 104        do {
 105                if (buffer_locked(bh))
 106                        *writeback = true;
 107
 108                if (buffer_dirty(bh))
 109                        *dirty = true;
 110
 111                bh = bh->b_this_page;
 112        } while (bh != head);
 113}
 114EXPORT_SYMBOL(buffer_check_dirty_writeback);
 115
 116/*
 117 * Block until a buffer comes unlocked.  This doesn't stop it
 118 * from becoming locked again - you have to lock it yourself
 119 * if you want to preserve its state.
 120 */
 121void __wait_on_buffer(struct buffer_head * bh)
 122{
 123        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 124}
 125EXPORT_SYMBOL(__wait_on_buffer);
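
/*
 * Editorial sketch (not part of this file): waiting alone does not pin the
 * buffer's state, so callers either re-lock the buffer before relying on it,
 * or only inspect "sticky" state such as the uptodate bit after I/O, e.g.:
 *
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		return -EIO;
 */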
 126
 127static void
 128__clear_page_buffers(struct page *page)
 129{
 130        ClearPagePrivate(page);
 131        set_page_private(page, 0);
 132        page_cache_release(page);
 133}
 134
 135static void buffer_io_error(struct buffer_head *bh, char *msg)
 136{
 137        if (!test_bit(BH_Quiet, &bh->b_state))
 138                printk_ratelimited(KERN_ERR
 139                        "Buffer I/O error on dev %pg, logical block %llu%s\n",
 140                        bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 141}
 142
 143/*
 144 * End-of-IO handler helper function which does not touch the bh after
 145 * unlocking it.
 146 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
  147 * a race there is benign: unlock_buffer() only uses the bh's address for
 148 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 149 * itself.
 150 */
 151static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 152{
 153        if (uptodate) {
 154                set_buffer_uptodate(bh);
 155        } else {
 156                /* This happens, due to failed READA attempts. */
 157                clear_buffer_uptodate(bh);
 158        }
 159        unlock_buffer(bh);
 160}
 161
 162/*
 163 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 164 * unlock the buffer. This is what ll_rw_block uses too.
 165 */
 166void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 167{
 168        __end_buffer_read_notouch(bh, uptodate);
 169        put_bh(bh);
 170}
 171EXPORT_SYMBOL(end_buffer_read_sync);
 172
 173void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 174{
 175        if (uptodate) {
 176                set_buffer_uptodate(bh);
 177        } else {
 178                buffer_io_error(bh, ", lost sync page write");
 179                set_buffer_write_io_error(bh);
 180                clear_buffer_uptodate(bh);
 181        }
 182        unlock_buffer(bh);
 183        put_bh(bh);
 184}
 185EXPORT_SYMBOL(end_buffer_write_sync);
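
/*
 * Editorial sketch (not part of this file): a one-off synchronous write of a
 * dirty buffer is typically wired up with end_buffer_write_sync(), essentially
 * what sync_dirty_buffer(), defined later in this file, does (error handling
 * omitted):
 *
 *	lock_buffer(bh);
 *	if (test_clear_buffer_dirty(bh)) {
 *		get_bh(bh);
 *		bh->b_end_io = end_buffer_write_sync;
 *		submit_bh(WRITE, bh);
 *		wait_on_buffer(bh);
 *	} else {
 *		unlock_buffer(bh);
 *	}
 */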
 186
 187/*
 188 * Various filesystems appear to want __find_get_block to be non-blocking.
 189 * But it's the page lock which protects the buffers.  To get around this,
 190 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 191 * private_lock.
 192 *
 193 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 194 * may be quite high.  This code could TryLock the page, and if that
 195 * succeeds, there is no need to take private_lock. (But if
 196 * private_lock is contended then so is mapping->tree_lock).
 197 */
 198static struct buffer_head *
 199__find_get_block_slow(struct block_device *bdev, sector_t block)
 200{
 201        struct inode *bd_inode = bdev->bd_inode;
 202        struct address_space *bd_mapping = bd_inode->i_mapping;
 203        struct buffer_head *ret = NULL;
 204        pgoff_t index;
 205        struct buffer_head *bh;
 206        struct buffer_head *head;
 207        struct page *page;
 208        int all_mapped = 1;
 209
 210        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 211        page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 212        if (!page)
 213                goto out;
 214
 215        spin_lock(&bd_mapping->private_lock);
 216        if (!page_has_buffers(page))
 217                goto out_unlock;
 218        head = page_buffers(page);
 219        bh = head;
 220        do {
 221                if (!buffer_mapped(bh))
 222                        all_mapped = 0;
 223                else if (bh->b_blocknr == block) {
 224                        ret = bh;
 225                        get_bh(bh);
 226                        goto out_unlock;
 227                }
 228                bh = bh->b_this_page;
 229        } while (bh != head);
 230
 231        /* we might be here because some of the buffers on this page are
 232         * not mapped.  This is due to various races between
 233         * file io on the block device and getblk.  It gets dealt with
 234         * elsewhere, don't buffer_error if we had some unmapped buffers
 235         */
 236        if (all_mapped) {
 237                printk("__find_get_block_slow() failed. "
 238                        "block=%llu, b_blocknr=%llu\n",
 239                        (unsigned long long)block,
 240                        (unsigned long long)bh->b_blocknr);
 241                printk("b_state=0x%08lx, b_size=%zu\n",
 242                        bh->b_state, bh->b_size);
 243                printk("device %pg blocksize: %d\n", bdev,
 244                        1 << bd_inode->i_blkbits);
 245        }
 246out_unlock:
 247        spin_unlock(&bd_mapping->private_lock);
 248        page_cache_release(page);
 249out:
 250        return ret;
 251}
 252
 253/*
 254 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 255 */
 256static void free_more_memory(void)
 257{
 258        struct zone *zone;
 259        int nid;
 260
 261        wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 262        yield();
 263
 264        for_each_online_node(nid) {
 265                (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 266                                                gfp_zone(GFP_NOFS), NULL,
 267                                                &zone);
 268                if (zone)
 269                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 270                                                GFP_NOFS, NULL);
 271        }
 272}
 273
 274/*
 275 * I/O completion handler for block_read_full_page() - pages
 276 * which come unlocked at the end of I/O.
 277 */
 278static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 279{
 280        unsigned long flags;
 281        struct buffer_head *first;
 282        struct buffer_head *tmp;
 283        struct page *page;
 284        int page_uptodate = 1;
 285
 286        BUG_ON(!buffer_async_read(bh));
 287
 288        page = bh->b_page;
 289        if (uptodate) {
 290                set_buffer_uptodate(bh);
 291        } else {
 292                clear_buffer_uptodate(bh);
 293                buffer_io_error(bh, ", async page read");
 294                SetPageError(page);
 295        }
 296
 297        /*
 298         * Be _very_ careful from here on. Bad things can happen if
 299         * two buffer heads end IO at almost the same time and both
 300         * decide that the page is now completely done.
 301         */
 302        first = page_buffers(page);
 303        local_irq_save(flags);
 304        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 305        clear_buffer_async_read(bh);
 306        unlock_buffer(bh);
 307        tmp = bh;
 308        do {
 309                if (!buffer_uptodate(tmp))
 310                        page_uptodate = 0;
 311                if (buffer_async_read(tmp)) {
 312                        BUG_ON(!buffer_locked(tmp));
 313                        goto still_busy;
 314                }
 315                tmp = tmp->b_this_page;
 316        } while (tmp != bh);
 317        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 318        local_irq_restore(flags);
 319
 320        /*
 321         * If none of the buffers had errors and they are all
 322         * uptodate then we can set the page uptodate.
 323         */
 324        if (page_uptodate && !PageError(page))
 325                SetPageUptodate(page);
 326        unlock_page(page);
 327        return;
 328
 329still_busy:
 330        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 331        local_irq_restore(flags);
 332        return;
 333}
 334
 335/*
 336 * Completion handler for block_write_full_page() - pages which are unlocked
 337 * during I/O, and which have PageWriteback cleared upon I/O completion.
 338 */
 339void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 340{
 341        unsigned long flags;
 342        struct buffer_head *first;
 343        struct buffer_head *tmp;
 344        struct page *page;
 345
 346        BUG_ON(!buffer_async_write(bh));
 347
 348        page = bh->b_page;
 349        if (uptodate) {
 350                set_buffer_uptodate(bh);
 351        } else {
 352                buffer_io_error(bh, ", lost async page write");
 353                set_bit(AS_EIO, &page->mapping->flags);
 354                set_buffer_write_io_error(bh);
 355                clear_buffer_uptodate(bh);
 356                SetPageError(page);
 357        }
 358
 359        first = page_buffers(page);
 360        local_irq_save(flags);
 361        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 362
 363        clear_buffer_async_write(bh);
 364        unlock_buffer(bh);
 365        tmp = bh->b_this_page;
 366        while (tmp != bh) {
 367                if (buffer_async_write(tmp)) {
 368                        BUG_ON(!buffer_locked(tmp));
 369                        goto still_busy;
 370                }
 371                tmp = tmp->b_this_page;
 372        }
 373        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 374        local_irq_restore(flags);
 375        end_page_writeback(page);
 376        return;
 377
 378still_busy:
 379        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 380        local_irq_restore(flags);
 381        return;
 382}
 383EXPORT_SYMBOL(end_buffer_async_write);
 384
 385/*
  386 * If a page's buffers are under async read-in (end_buffer_async_read
 387 * completion) then there is a possibility that another thread of
 388 * control could lock one of the buffers after it has completed
 389 * but while some of the other buffers have not completed.  This
 390 * locked buffer would confuse end_buffer_async_read() into not unlocking
 391 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 392 * that this buffer is not under async I/O.
 393 *
 394 * The page comes unlocked when it has no locked buffer_async buffers
 395 * left.
 396 *
  397 * PageLocked prevents anyone from starting new async I/O reads against
  398 * any of the buffers.
 399 *
 400 * PageWriteback is used to prevent simultaneous writeout of the same
 401 * page.
 402 *
 403 * PageLocked prevents anyone from starting writeback of a page which is
 404 * under read I/O (PageWriteback is only ever set against a locked page).
 405 */
 406static void mark_buffer_async_read(struct buffer_head *bh)
 407{
 408        bh->b_end_io = end_buffer_async_read;
 409        set_buffer_async_read(bh);
 410}
 411
 412static void mark_buffer_async_write_endio(struct buffer_head *bh,
 413                                          bh_end_io_t *handler)
 414{
 415        bh->b_end_io = handler;
 416        set_buffer_async_write(bh);
 417}
 418
 419void mark_buffer_async_write(struct buffer_head *bh)
 420{
 421        mark_buffer_async_write_endio(bh, end_buffer_async_write);
 422}
 423EXPORT_SYMBOL(mark_buffer_async_write);
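
/*
 * Editorial sketch (not part of this file): a writepage-style path marks each
 * buffer async before submitting it, so that end_buffer_async_write() can end
 * page writeback once the last one completes.  Greatly simplified (the real
 * code, __block_write_full_page() later in this file, also handles the
 * mapped/delay checks, the no-I/O case and errors):
 *
 *	head = page_buffers(page);
 *	bh = head;
 *	do {
 *		lock_buffer(bh);
 *		if (test_clear_buffer_dirty(bh))
 *			mark_buffer_async_write(bh);
 *		else
 *			unlock_buffer(bh);
 *		bh = bh->b_this_page;
 *	} while (bh != head);
 *	set_page_writeback(page);
 *	unlock_page(page);
 *	do {
 *		struct buffer_head *next = bh->b_this_page;
 *		if (buffer_async_write(bh))
 *			submit_bh(WRITE, bh);
 *		bh = next;
 *	} while (bh != head);
 */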
 424
 425
 426/*
 427 * fs/buffer.c contains helper functions for buffer-backed address space's
 428 * fsync functions.  A common requirement for buffer-based filesystems is
 429 * that certain data from the backing blockdev needs to be written out for
 430 * a successful fsync().  For example, ext2 indirect blocks need to be
 431 * written back and waited upon before fsync() returns.
 432 *
 433 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 434 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 435 * management of a list of dependent buffers at ->i_mapping->private_list.
 436 *
 437 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 438 * from their controlling inode's queue when they are being freed.  But
 439 * try_to_free_buffers() will be operating against the *blockdev* mapping
 440 * at the time, not against the S_ISREG file which depends on those buffers.
 441 * So the locking for private_list is via the private_lock in the address_space
 442 * which backs the buffers.  Which is different from the address_space 
 443 * against which the buffers are listed.  So for a particular address_space,
 444 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 445 * mapping->private_list will always be protected by the backing blockdev's
 446 * ->private_lock.
 447 *
 448 * Which introduces a requirement: all buffers on an address_space's
 449 * ->private_list must be from the same address_space: the blockdev's.
 450 *
 451 * address_spaces which do not place buffers at ->private_list via these
 452 * utility functions are free to use private_lock and private_list for
 453 * whatever they want.  The only requirement is that list_empty(private_list)
 454 * be true at clear_inode() time.
 455 *
 456 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 457 * filesystems should do that.  invalidate_inode_buffers() should just go
 458 * BUG_ON(!list_empty).
 459 *
 460 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 461 * take an address_space, not an inode.  And it should be called
 462 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 463 * queued up.
 464 *
 465 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 466 * list if it is already on a list.  Because if the buffer is on a list,
 467 * it *must* already be on the right one.  If not, the filesystem is being
 468 * silly.  This will save a ton of locking.  But first we have to ensure
 469 * that buffers are taken *off* the old inode's list when they are freed
 470 * (presumably in truncate).  That requires careful auditing of all
 471 * filesystems (do it inside bforget()).  It could also be done by bringing
 472 * b_inode back.
 473 */
 474
 475/*
 476 * The buffer's backing address_space's private_lock must be held
 477 */
 478static void __remove_assoc_queue(struct buffer_head *bh)
 479{
 480        list_del_init(&bh->b_assoc_buffers);
 481        WARN_ON(!bh->b_assoc_map);
 482        if (buffer_write_io_error(bh))
 483                set_bit(AS_EIO, &bh->b_assoc_map->flags);
 484        bh->b_assoc_map = NULL;
 485}
 486
 487int inode_has_buffers(struct inode *inode)
 488{
 489        return !list_empty(&inode->i_data.private_list);
 490}
 491
 492/*
 493 * osync is designed to support O_SYNC io.  It waits synchronously for
 494 * all already-submitted IO to complete, but does not queue any new
 495 * writes to the disk.
 496 *
 497 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 498 * you dirty the buffers, and then use osync_inode_buffers to wait for
 499 * completion.  Any other dirty buffers which are not yet queued for
 500 * write will not be flushed to disk by the osync.
 501 */
 502static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 503{
 504        struct buffer_head *bh;
 505        struct list_head *p;
 506        int err = 0;
 507
 508        spin_lock(lock);
 509repeat:
 510        list_for_each_prev(p, list) {
 511                bh = BH_ENTRY(p);
 512                if (buffer_locked(bh)) {
 513                        get_bh(bh);
 514                        spin_unlock(lock);
 515                        wait_on_buffer(bh);
 516                        if (!buffer_uptodate(bh))
 517                                err = -EIO;
 518                        brelse(bh);
 519                        spin_lock(lock);
 520                        goto repeat;
 521                }
 522        }
 523        spin_unlock(lock);
 524        return err;
 525}
 526
 527static void do_thaw_one(struct super_block *sb, void *unused)
 528{
 529        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 530                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
 531}
 532
 533static void do_thaw_all(struct work_struct *work)
 534{
 535        iterate_supers(do_thaw_one, NULL);
 536        kfree(work);
 537        printk(KERN_WARNING "Emergency Thaw complete\n");
 538}
 539
 540/**
 541 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 542 *
 543 * Used for emergency unfreeze of all filesystems via SysRq
 544 */
 545void emergency_thaw_all(void)
 546{
 547        struct work_struct *work;
 548
 549        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 550        if (work) {
 551                INIT_WORK(work, do_thaw_all);
 552                schedule_work(work);
 553        }
 554}
 555
 556/**
 557 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 558 * @mapping: the mapping which wants those buffers written
 559 *
 560 * Starts I/O against the buffers at mapping->private_list, and waits upon
 561 * that I/O.
 562 *
 563 * Basically, this is a convenience function for fsync().
 564 * @mapping is a file or directory which needs those buffers to be written for
 565 * a successful fsync().
 566 */
 567int sync_mapping_buffers(struct address_space *mapping)
 568{
 569        struct address_space *buffer_mapping = mapping->private_data;
 570
 571        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 572                return 0;
 573
 574        return fsync_buffers_list(&buffer_mapping->private_lock,
 575                                        &mapping->private_list);
 576}
 577EXPORT_SYMBOL(sync_mapping_buffers);
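
/*
 * Editorial sketch (not part of this file): a simple ->fsync() for a
 * buffer-backed filesystem flushes the data pages first and then the
 * associated metadata buffers, roughly as generic_file_fsync() in fs/libfs.c
 * does (locking and inode metadata writeback omitted; example_fsync is a
 * hypothetical name used only for illustration):
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err, err2;
 *
 *		err = filemap_write_and_wait_range(mapping, start, end);
 *		err2 = sync_mapping_buffers(mapping);
 *		return err ? err : err2;
 *	}
 */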
 578
 579/*
 580 * Called when we've recently written block `bblock', and it is known that
 581 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 582 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 583 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 584 */
 585void write_boundary_block(struct block_device *bdev,
 586                        sector_t bblock, unsigned blocksize)
 587{
 588        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 589        if (bh) {
 590                if (buffer_dirty(bh))
 591                        ll_rw_block(WRITE, 1, &bh);
 592                put_bh(bh);
 593        }
 594}
 595
 596void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 597{
 598        struct address_space *mapping = inode->i_mapping;
 599        struct address_space *buffer_mapping = bh->b_page->mapping;
 600
 601        mark_buffer_dirty(bh);
 602        if (!mapping->private_data) {
 603                mapping->private_data = buffer_mapping;
 604        } else {
 605                BUG_ON(mapping->private_data != buffer_mapping);
 606        }
 607        if (!bh->b_assoc_map) {
 608                spin_lock(&buffer_mapping->private_lock);
 609                list_move_tail(&bh->b_assoc_buffers,
 610                                &mapping->private_list);
 611                bh->b_assoc_map = mapping;
 612                spin_unlock(&buffer_mapping->private_lock);
 613        }
 614}
 615EXPORT_SYMBOL(mark_buffer_dirty_inode);
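
/*
 * Editorial sketch (not part of this file): a filesystem that updates, say, an
 * indirect block on behalf of a regular file associates the buffer with that
 * file's mapping, so that sync_mapping_buffers() above will write it out and
 * wait on it at fsync time ("blocknr" and the update itself are placeholders):
 *
 *	struct buffer_head *bh = sb_bread(inode->i_sb, blocknr);
 *
 *	if (bh) {
 *		... update the block image in bh->b_data ...
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *	}
 */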
 616
 617/*
 618 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 619 * dirty.
 620 *
 621 * If warn is true, then emit a warning if the page is not uptodate and has
 622 * not been truncated.
 623 *
 624 * The caller must hold mem_cgroup_begin_page_stat() lock.
 625 */
 626static void __set_page_dirty(struct page *page, struct address_space *mapping,
 627                             struct mem_cgroup *memcg, int warn)
 628{
 629        unsigned long flags;
 630
 631        spin_lock_irqsave(&mapping->tree_lock, flags);
 632        if (page->mapping) {    /* Race with truncate? */
 633                WARN_ON_ONCE(warn && !PageUptodate(page));
 634                account_page_dirtied(page, mapping, memcg);
 635                radix_tree_tag_set(&mapping->page_tree,
 636                                page_index(page), PAGECACHE_TAG_DIRTY);
 637        }
 638        spin_unlock_irqrestore(&mapping->tree_lock, flags);
 639}
 640
 641/*
 642 * Add a page to the dirty page list.
 643 *
 644 * It is a sad fact of life that this function is called from several places
 645 * deeply under spinlocking.  It may not sleep.
 646 *
 647 * If the page has buffers, the uptodate buffers are set dirty, to preserve
  648 * dirty-state coherency between the page and the buffers.  If the page does
 649 * not have buffers then when they are later attached they will all be set
 650 * dirty.
 651 *
 652 * The buffers are dirtied before the page is dirtied.  There's a small race
 653 * window in which a writepage caller may see the page cleanness but not the
 654 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 655 * before the buffers, a concurrent writepage caller could clear the page dirty
 656 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 657 * page on the dirty page list.
 658 *
 659 * We use private_lock to lock against try_to_free_buffers while using the
 660 * page's buffer list.  Also use this to protect against clean buffers being
 661 * added to the page after it was set dirty.
 662 *
 663 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 664 * address_space though.
 665 */
 666int __set_page_dirty_buffers(struct page *page)
 667{
 668        int newly_dirty;
 669        struct mem_cgroup *memcg;
 670        struct address_space *mapping = page_mapping(page);
 671
 672        if (unlikely(!mapping))
 673                return !TestSetPageDirty(page);
 674
 675        spin_lock(&mapping->private_lock);
 676        if (page_has_buffers(page)) {
 677                struct buffer_head *head = page_buffers(page);
 678                struct buffer_head *bh = head;
 679
 680                do {
 681                        set_buffer_dirty(bh);
 682                        bh = bh->b_this_page;
 683                } while (bh != head);
 684        }
 685        /*
  686         * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
 687         * per-memcg dirty page counters.
 688         */
 689        memcg = mem_cgroup_begin_page_stat(page);
 690        newly_dirty = !TestSetPageDirty(page);
 691        spin_unlock(&mapping->private_lock);
 692
 693        if (newly_dirty)
 694                __set_page_dirty(page, mapping, memcg, 1);
 695
 696        mem_cgroup_end_page_stat(memcg);
 697
 698        if (newly_dirty)
 699                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 700
 701        return newly_dirty;
 702}
 703EXPORT_SYMBOL(__set_page_dirty_buffers);
 704
 705/*
 706 * Write out and wait upon a list of buffers.
 707 *
 708 * We have conflicting pressures: we want to make sure that all
 709 * initially dirty buffers get waited on, but that any subsequently
 710 * dirtied buffers don't.  After all, we don't want fsync to last
 711 * forever if somebody is actively writing to the file.
 712 *
 713 * Do this in two main stages: first we copy dirty buffers to a
 714 * temporary inode list, queueing the writes as we go.  Then we clean
 715 * up, waiting for those writes to complete.
 716 * 
 717 * During this second stage, any subsequent updates to the file may end
 718 * up refiling the buffer on the original inode's dirty list again, so
 719 * there is a chance we will end up with a buffer queued for write but
 720 * not yet completed on that list.  So, as a final cleanup we go through
 721 * the osync code to catch these locked, dirty buffers without requeuing
 722 * any newly dirty buffers for write.
 723 */
 724static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 725{
 726        struct buffer_head *bh;
 727        struct list_head tmp;
 728        struct address_space *mapping;
 729        int err = 0, err2;
 730        struct blk_plug plug;
 731
 732        INIT_LIST_HEAD(&tmp);
 733        blk_start_plug(&plug);
 734
 735        spin_lock(lock);
 736        while (!list_empty(list)) {
 737                bh = BH_ENTRY(list->next);
 738                mapping = bh->b_assoc_map;
 739                __remove_assoc_queue(bh);
 740                /* Avoid race with mark_buffer_dirty_inode() which does
 741                 * a lockless check and we rely on seeing the dirty bit */
 742                smp_mb();
 743                if (buffer_dirty(bh) || buffer_locked(bh)) {
 744                        list_add(&bh->b_assoc_buffers, &tmp);
 745                        bh->b_assoc_map = mapping;
 746                        if (buffer_dirty(bh)) {
 747                                get_bh(bh);
 748                                spin_unlock(lock);
 749                                /*
 750                                 * Ensure any pending I/O completes so that
 751                                 * write_dirty_buffer() actually writes the
 752                                 * current contents - it is a noop if I/O is
 753                                 * still in flight on potentially older
 754                                 * contents.
 755                                 */
 756                                write_dirty_buffer(bh, WRITE_SYNC);
 757
 758                                /*
 759                                 * Kick off IO for the previous mapping. Note
 760                                 * that we will not run the very last mapping,
 761                                 * wait_on_buffer() will do that for us
 762                                 * through sync_buffer().
 763                                 */
 764                                brelse(bh);
 765                                spin_lock(lock);
 766                        }
 767                }
 768        }
 769
 770        spin_unlock(lock);
 771        blk_finish_plug(&plug);
 772        spin_lock(lock);
 773
 774        while (!list_empty(&tmp)) {
 775                bh = BH_ENTRY(tmp.prev);
 776                get_bh(bh);
 777                mapping = bh->b_assoc_map;
 778                __remove_assoc_queue(bh);
 779                /* Avoid race with mark_buffer_dirty_inode() which does
 780                 * a lockless check and we rely on seeing the dirty bit */
 781                smp_mb();
 782                if (buffer_dirty(bh)) {
 783                        list_add(&bh->b_assoc_buffers,
 784                                 &mapping->private_list);
 785                        bh->b_assoc_map = mapping;
 786                }
 787                spin_unlock(lock);
 788                wait_on_buffer(bh);
 789                if (!buffer_uptodate(bh))
 790                        err = -EIO;
 791                brelse(bh);
 792                spin_lock(lock);
 793        }
 794        
 795        spin_unlock(lock);
 796        err2 = osync_buffers_list(lock, list);
 797        if (err)
 798                return err;
 799        else
 800                return err2;
 801}
 802
 803/*
 804 * Invalidate any and all dirty buffers on a given inode.  We are
 805 * probably unmounting the fs, but that doesn't mean we have already
 806 * done a sync().  Just drop the buffers from the inode list.
 807 *
 808 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 809 * assumes that all the buffers are against the blockdev.  Not true
 810 * for reiserfs.
 811 */
 812void invalidate_inode_buffers(struct inode *inode)
 813{
 814        if (inode_has_buffers(inode)) {
 815                struct address_space *mapping = &inode->i_data;
 816                struct list_head *list = &mapping->private_list;
 817                struct address_space *buffer_mapping = mapping->private_data;
 818
 819                spin_lock(&buffer_mapping->private_lock);
 820                while (!list_empty(list))
 821                        __remove_assoc_queue(BH_ENTRY(list->next));
 822                spin_unlock(&buffer_mapping->private_lock);
 823        }
 824}
 825EXPORT_SYMBOL(invalidate_inode_buffers);
 826
 827/*
 828 * Remove any clean buffers from the inode's buffer list.  This is called
 829 * when we're trying to free the inode itself.  Those buffers can pin it.
 830 *
 831 * Returns true if all buffers were removed.
 832 */
 833int remove_inode_buffers(struct inode *inode)
 834{
 835        int ret = 1;
 836
 837        if (inode_has_buffers(inode)) {
 838                struct address_space *mapping = &inode->i_data;
 839                struct list_head *list = &mapping->private_list;
 840                struct address_space *buffer_mapping = mapping->private_data;
 841
 842                spin_lock(&buffer_mapping->private_lock);
 843                while (!list_empty(list)) {
 844                        struct buffer_head *bh = BH_ENTRY(list->next);
 845                        if (buffer_dirty(bh)) {
 846                                ret = 0;
 847                                break;
 848                        }
 849                        __remove_assoc_queue(bh);
 850                }
 851                spin_unlock(&buffer_mapping->private_lock);
 852        }
 853        return ret;
 854}
 855
 856/*
 857 * Create the appropriate buffers when given a page for data area and
 858 * the size of each buffer.. Use the bh->b_this_page linked list to
 859 * follow the buffers created.  Return NULL if unable to create more
 860 * buffers.
 861 *
  862 * The retry flag is used to differentiate async IO (paging, swapping),
  863 * which is not allowed to fail, from ordinary buffer allocations.
 864 */
 865struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 866                int retry)
 867{
 868        struct buffer_head *bh, *head;
 869        long offset;
 870
 871try_again:
 872        head = NULL;
 873        offset = PAGE_SIZE;
 874        while ((offset -= size) >= 0) {
 875                bh = alloc_buffer_head(GFP_NOFS);
 876                if (!bh)
 877                        goto no_grow;
 878
 879                bh->b_this_page = head;
 880                bh->b_blocknr = -1;
 881                head = bh;
 882
 883                bh->b_size = size;
 884
 885                /* Link the buffer to its page */
 886                set_bh_page(bh, page, offset);
 887        }
 888        return head;
 889/*
 890 * In case anything failed, we just free everything we got.
 891 */
 892no_grow:
 893        if (head) {
 894                do {
 895                        bh = head;
 896                        head = head->b_this_page;
 897                        free_buffer_head(bh);
 898                } while (head);
 899        }
 900
 901        /*
 902         * Return failure for non-async IO requests.  Async IO requests
 903         * are not allowed to fail, so we have to wait until buffer heads
 904         * become available.  But we don't want tasks sleeping with 
 905         * partially complete buffers, so all were released above.
 906         */
 907        if (!retry)
 908                return NULL;
 909
 910        /* We're _really_ low on memory. Now we just
 911         * wait for old buffer heads to become free due to
 912         * finishing IO.  Since this is an async request and
 913         * the reserve list is empty, we're sure there are 
 914         * async buffer heads in use.
 915         */
 916        free_more_memory();
 917        goto try_again;
 918}
 919EXPORT_SYMBOL_GPL(alloc_page_buffers);
 920
 921static inline void
 922link_dev_buffers(struct page *page, struct buffer_head *head)
 923{
 924        struct buffer_head *bh, *tail;
 925
 926        bh = head;
 927        do {
 928                tail = bh;
 929                bh = bh->b_this_page;
 930        } while (bh);
 931        tail->b_this_page = head;
 932        attach_page_buffers(page, head);
 933}
 934
 935static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 936{
 937        sector_t retval = ~((sector_t)0);
 938        loff_t sz = i_size_read(bdev->bd_inode);
 939
 940        if (sz) {
 941                unsigned int sizebits = blksize_bits(size);
 942                retval = (sz >> sizebits);
 943        }
 944        return retval;
 945}
 946
 947/*
 948 * Initialise the state of a blockdev page's buffers.
 949 */ 
 950static sector_t
 951init_page_buffers(struct page *page, struct block_device *bdev,
 952                        sector_t block, int size)
 953{
 954        struct buffer_head *head = page_buffers(page);
 955        struct buffer_head *bh = head;
 956        int uptodate = PageUptodate(page);
 957        sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 958
 959        do {
 960                if (!buffer_mapped(bh)) {
 961                        init_buffer(bh, NULL, NULL);
 962                        bh->b_bdev = bdev;
 963                        bh->b_blocknr = block;
 964                        if (uptodate)
 965                                set_buffer_uptodate(bh);
 966                        if (block < end_block)
 967                                set_buffer_mapped(bh);
 968                }
 969                block++;
 970                bh = bh->b_this_page;
 971        } while (bh != head);
 972
 973        /*
 974         * Caller needs to validate requested block against end of device.
 975         */
 976        return end_block;
 977}
 978
 979/*
 980 * Create the page-cache page that contains the requested block.
 981 *
 982 * This is used purely for blockdev mappings.
 983 */
 984static int
 985grow_dev_page(struct block_device *bdev, sector_t block,
 986              pgoff_t index, int size, int sizebits, gfp_t gfp)
 987{
 988        struct inode *inode = bdev->bd_inode;
 989        struct page *page;
 990        struct buffer_head *bh;
 991        sector_t end_block;
 992        int ret = 0;            /* Will call free_more_memory() */
 993        gfp_t gfp_mask;
 994
 995        gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 996
 997        /*
 998         * XXX: __getblk_slow() can not really deal with failure and
 999         * will endlessly loop on improvised global reclaim.  Prefer
1000         * looping in the allocator rather than here, at least that
1001         * code knows what it's doing.
1002         */
1003        gfp_mask |= __GFP_NOFAIL;
1004
1005        page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1006        if (!page)
1007                return ret;
1008
1009        BUG_ON(!PageLocked(page));
1010
1011        if (page_has_buffers(page)) {
1012                bh = page_buffers(page);
1013                if (bh->b_size == size) {
1014                        end_block = init_page_buffers(page, bdev,
1015                                                (sector_t)index << sizebits,
1016                                                size);
1017                        goto done;
1018                }
1019                if (!try_to_free_buffers(page))
1020                        goto failed;
1021        }
1022
1023        /*
1024         * Allocate some buffers for this page
1025         */
1026        bh = alloc_page_buffers(page, size, 0);
1027        if (!bh)
1028                goto failed;
1029
1030        /*
1031         * Link the page to the buffers and initialise them.  Take the
1032         * lock to be atomic wrt __find_get_block(), which does not
1033         * run under the page lock.
1034         */
1035        spin_lock(&inode->i_mapping->private_lock);
1036        link_dev_buffers(page, bh);
1037        end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1038                        size);
1039        spin_unlock(&inode->i_mapping->private_lock);
1040done:
1041        ret = (block < end_block) ? 1 : -ENXIO;
1042failed:
1043        unlock_page(page);
1044        page_cache_release(page);
1045        return ret;
1046}
1047
1048/*
1049 * Create buffers for the specified block device block's page.  If
1050 * that page was dirty, the buffers are set dirty also.
1051 */
1052static int
1053grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1054{
1055        pgoff_t index;
1056        int sizebits;
1057
1058        sizebits = -1;
1059        do {
1060                sizebits++;
1061        } while ((size << sizebits) < PAGE_SIZE);
1062
1063        index = block >> sizebits;
1064
1065        /*
1066         * Check for a block which wants to lie outside our maximum possible
1067         * pagecache index.  (this comparison is done using sector_t types).
1068         */
1069        if (unlikely(index != block >> sizebits)) {
1070                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1071                        "device %pg\n",
1072                        __func__, (unsigned long long)block,
1073                        bdev);
1074                return -EIO;
1075        }
1076
1077        /* Create a page with the proper size buffers.. */
1078        return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1079}
1080
1081struct buffer_head *
1082__getblk_slow(struct block_device *bdev, sector_t block,
1083             unsigned size, gfp_t gfp)
1084{
1085        /* Size must be multiple of hard sectorsize */
1086        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087                        (size < 512 || size > PAGE_SIZE))) {
1088                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089                                        size);
1090                printk(KERN_ERR "logical block size: %d\n",
1091                                        bdev_logical_block_size(bdev));
1092
1093                dump_stack();
1094                return NULL;
1095        }
1096
1097        for (;;) {
1098                struct buffer_head *bh;
1099                int ret;
1100
1101                bh = __find_get_block(bdev, block, size);
1102                if (bh)
1103                        return bh;
1104
1105                ret = grow_buffers(bdev, block, size, gfp);
1106                if (ret < 0)
1107                        return NULL;
1108                if (ret == 0)
1109                        free_more_memory();
1110        }
1111}
1112EXPORT_SYMBOL(__getblk_slow);
1113
1114/*
1115 * The relationship between dirty buffers and dirty pages:
1116 *
1117 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1118 * the page is tagged dirty in its radix tree.
1119 *
1120 * At all times, the dirtiness of the buffers represents the dirtiness of
1121 * subsections of the page.  If the page has buffers, the page dirty bit is
1122 * merely a hint about the true dirty state.
1123 *
1124 * When a page is set dirty in its entirety, all its buffers are marked dirty
1125 * (if the page has buffers).
1126 *
1127 * When a buffer is marked dirty, its page is dirtied, but the page's other
1128 * buffers are not.
1129 *
1130 * Also.  When blockdev buffers are explicitly read with bread(), they
1131 * individually become uptodate.  But their backing page remains not
1132 * uptodate - even if all of its buffers are uptodate.  A subsequent
1133 * block_read_full_page() against that page will discover all the uptodate
1134 * buffers, will set the page uptodate and will perform no I/O.
1135 */
1136
1137/**
1138 * mark_buffer_dirty - mark a buffer_head as needing writeout
1139 * @bh: the buffer_head to mark dirty
1140 *
1141 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1142 * backing page dirty, then tag the page as dirty in its address_space's radix
1143 * tree and then attach the address_space's inode to its superblock's dirty
1144 * inode list.
1145 *
1146 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and mapping->host->i_lock.
1148 */
1149void mark_buffer_dirty(struct buffer_head *bh)
1150{
1151        WARN_ON_ONCE(!buffer_uptodate(bh));
1152
1153        trace_block_dirty_buffer(bh);
1154
1155        /*
1156         * Very *carefully* optimize the it-is-already-dirty case.
1157         *
1158         * Don't let the final "is it dirty" escape to before we
1159         * perhaps modified the buffer.
1160         */
1161        if (buffer_dirty(bh)) {
1162                smp_mb();
1163                if (buffer_dirty(bh))
1164                        return;
1165        }
1166
1167        if (!test_set_buffer_dirty(bh)) {
1168                struct page *page = bh->b_page;
1169                struct address_space *mapping = NULL;
1170                struct mem_cgroup *memcg;
1171
1172                memcg = mem_cgroup_begin_page_stat(page);
1173                if (!TestSetPageDirty(page)) {
1174                        mapping = page_mapping(page);
1175                        if (mapping)
1176                                __set_page_dirty(page, mapping, memcg, 0);
1177                }
1178                mem_cgroup_end_page_stat(memcg);
1179                if (mapping)
1180                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1181        }
1182}
1183EXPORT_SYMBOL(mark_buffer_dirty);
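
/*
 * Editorial sketch (not part of this file): the common metadata-update pattern
 * reads the block, modifies the in-memory copy and lets writeback pick it up;
 * "blocknr", "offset", "src" and "len" are placeholders:
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *
 *	if (!bh)
 *		return -EIO;
 *	memcpy(bh->b_data + offset, src, len);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 *
 * A caller that needs the block on disk before proceeding can use
 * sync_dirty_buffer(bh) instead of relying on background writeback.
 */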
1184
1185/*
1186 * Decrement a buffer_head's reference count.  If all buffers against a page
1187 * have zero reference count, are clean and unlocked, and if the page is clean
1188 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1189 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1190 * a page but it ends up not being freed, and buffers may later be reattached).
1191 */
1192void __brelse(struct buffer_head * buf)
1193{
1194        if (atomic_read(&buf->b_count)) {
1195                put_bh(buf);
1196                return;
1197        }
1198        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1199}
1200EXPORT_SYMBOL(__brelse);
1201
1202/*
1203 * bforget() is like brelse(), except it discards any
1204 * potentially dirty data.
1205 */
1206void __bforget(struct buffer_head *bh)
1207{
1208        clear_buffer_dirty(bh);
1209        if (bh->b_assoc_map) {
1210                struct address_space *buffer_mapping = bh->b_page->mapping;
1211
1212                spin_lock(&buffer_mapping->private_lock);
1213                list_del_init(&bh->b_assoc_buffers);
1214                bh->b_assoc_map = NULL;
1215                spin_unlock(&buffer_mapping->private_lock);
1216        }
1217        __brelse(bh);
1218}
1219EXPORT_SYMBOL(__bforget);
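
/*
 * Editorial sketch (not part of this file): bforget() (the inline wrapper in
 * <linux/buffer_head.h> around __bforget()) is for the case where a dirtied
 * buffer turns out to be garbage, e.g. when an allocation is abandoned:
 *
 *	if (abort_the_update)
 *		bforget(bh);
 *	else
 *		brelse(bh);
 *
 * "abort_the_update" is a placeholder for whatever condition makes the dirty
 * contents unwanted.
 */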
1220
1221static struct buffer_head *__bread_slow(struct buffer_head *bh)
1222{
1223        lock_buffer(bh);
1224        if (buffer_uptodate(bh)) {
1225                unlock_buffer(bh);
1226                return bh;
1227        } else {
1228                get_bh(bh);
1229                bh->b_end_io = end_buffer_read_sync;
1230                submit_bh(READ, bh);
1231                wait_on_buffer(bh);
1232                if (buffer_uptodate(bh))
1233                        return bh;
1234        }
1235        brelse(bh);
1236        return NULL;
1237}
1238
1239/*
 1240 * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1241 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1242 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1243 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1244 * CPU's LRUs at the same time.
1245 *
1246 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1247 * sb_find_get_block().
1248 *
1249 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1250 * a local interrupt disable for that.
1251 */
1252
1253#define BH_LRU_SIZE     16
1254
1255struct bh_lru {
1256        struct buffer_head *bhs[BH_LRU_SIZE];
1257};
1258
1259static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1260
1261#ifdef CONFIG_SMP
1262#define bh_lru_lock()   local_irq_disable()
1263#define bh_lru_unlock() local_irq_enable()
1264#else
1265#define bh_lru_lock()   preempt_disable()
1266#define bh_lru_unlock() preempt_enable()
1267#endif
1268
1269static inline void check_irqs_on(void)
1270{
1271#ifdef irqs_disabled
1272        BUG_ON(irqs_disabled());
1273#endif
1274}
1275
1276/*
1277 * The LRU management algorithm is dopey-but-simple.  Sorry.
1278 */
1279static void bh_lru_install(struct buffer_head *bh)
1280{
1281        struct buffer_head *evictee = NULL;
1282
1283        check_irqs_on();
1284        bh_lru_lock();
1285        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1286                struct buffer_head *bhs[BH_LRU_SIZE];
1287                int in;
1288                int out = 0;
1289
1290                get_bh(bh);
1291                bhs[out++] = bh;
1292                for (in = 0; in < BH_LRU_SIZE; in++) {
1293                        struct buffer_head *bh2 =
1294                                __this_cpu_read(bh_lrus.bhs[in]);
1295
1296                        if (bh2 == bh) {
1297                                __brelse(bh2);
1298                        } else {
1299                                if (out >= BH_LRU_SIZE) {
1300                                        BUG_ON(evictee != NULL);
1301                                        evictee = bh2;
1302                                } else {
1303                                        bhs[out++] = bh2;
1304                                }
1305                        }
1306                }
1307                while (out < BH_LRU_SIZE)
1308                        bhs[out++] = NULL;
1309                memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1310        }
1311        bh_lru_unlock();
1312
1313        if (evictee)
1314                __brelse(evictee);
1315}
1316
1317/*
1318 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1319 */
1320static struct buffer_head *
1321lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1322{
1323        struct buffer_head *ret = NULL;
1324        unsigned int i;
1325
1326        check_irqs_on();
1327        bh_lru_lock();
1328        for (i = 0; i < BH_LRU_SIZE; i++) {
1329                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1330
1331                if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1332                    bh->b_size == size) {
1333                        if (i) {
1334                                while (i) {
1335                                        __this_cpu_write(bh_lrus.bhs[i],
1336                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
1337                                        i--;
1338                                }
1339                                __this_cpu_write(bh_lrus.bhs[0], bh);
1340                        }
1341                        get_bh(bh);
1342                        ret = bh;
1343                        break;
1344                }
1345        }
1346        bh_lru_unlock();
1347        return ret;
1348}
1349
1350/*
1351 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1352 * it in the LRU and mark it as accessed.  If it is not present then return
1353 * NULL
1354 */
1355struct buffer_head *
1356__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1357{
1358        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1359
1360        if (bh == NULL) {
1361                /* __find_get_block_slow will mark the page accessed */
1362                bh = __find_get_block_slow(bdev, block);
1363                if (bh)
1364                        bh_lru_install(bh);
1365        } else
1366                touch_buffer(bh);
1367
1368        return bh;
1369}
1370EXPORT_SYMBOL(__find_get_block);
1371
1372/*
1373 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1374 * which corresponds to the passed block_device, block and size. The
1375 * returned buffer has its reference count incremented.
1376 *
1377 * __getblk_gfp() will lock up the machine if grow_dev_page's
1378 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1379 */
1380struct buffer_head *
1381__getblk_gfp(struct block_device *bdev, sector_t block,
1382             unsigned size, gfp_t gfp)
1383{
1384        struct buffer_head *bh = __find_get_block(bdev, block, size);
1385
1386        might_sleep();
1387        if (bh == NULL)
1388                bh = __getblk_slow(bdev, block, size, gfp);
1389        return bh;
1390}
1391EXPORT_SYMBOL(__getblk_gfp);
1392
1393/*
1394 * Do async read-ahead on a buffer..
1395 */
1396void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1397{
1398        struct buffer_head *bh = __getblk(bdev, block, size);
1399        if (likely(bh)) {
1400                ll_rw_block(READA, 1, &bh);
1401                brelse(bh);
1402        }
1403}
1404EXPORT_SYMBOL(__breadahead);
1405
1406/**
1407 *  __bread_gfp() - reads a specified block and returns the bh
1408 *  @bdev: the block_device to read from
1409 *  @block: number of block
1410 *  @size: size (in bytes) to read
1411 *  @gfp: page allocation flag
1412 *
 1413 *  Reads a specified block, and returns the buffer head that contains it.
 1414 *  If @gfp is zero, the page cache page is allocated from the non-movable
 1415 *  area so that it does not prevent page migration.
1416 *  It returns NULL if the block was unreadable.
1417 */
1418struct buffer_head *
1419__bread_gfp(struct block_device *bdev, sector_t block,
1420                   unsigned size, gfp_t gfp)
1421{
1422        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1423
1424        if (likely(bh) && !buffer_uptodate(bh))
1425                bh = __bread_slow(bh);
1426        return bh;
1427}
1428EXPORT_SYMBOL(__bread_gfp);
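
/*
 * Editorial sketch (not part of this file): most filesystems go through the
 * sb_bread()/sb_bread_unmovable() wrappers in <linux/buffer_head.h>, which
 * pass __GFP_MOVABLE or 0 respectively for @gfp ("blocknr" is a placeholder):
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *
 *	if (!bh)
 *		return -EIO;
 *	... read the on-disk structure out of bh->b_data ...
 *	brelse(bh);
 */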
1429
1430/*
1431 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1432 * This doesn't race because it runs in each cpu either in irq
1433 * or with preempt disabled.
1434 */
1435static void invalidate_bh_lru(void *arg)
1436{
1437        struct bh_lru *b = &get_cpu_var(bh_lrus);
1438        int i;
1439
1440        for (i = 0; i < BH_LRU_SIZE; i++) {
1441                brelse(b->bhs[i]);
1442                b->bhs[i] = NULL;
1443        }
1444        put_cpu_var(bh_lrus);
1445}
1446
1447static bool has_bh_in_lru(int cpu, void *dummy)
1448{
1449        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1450        int i;
1451        
1452        for (i = 0; i < BH_LRU_SIZE; i++) {
1453                if (b->bhs[i])
1454                        return 1;
1455        }
1456
1457        return 0;
1458}
1459
1460void invalidate_bh_lrus(void)
1461{
1462        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1463}
1464EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1465
1466void set_bh_page(struct buffer_head *bh,
1467                struct page *page, unsigned long offset)
1468{
1469        bh->b_page = page;
1470        BUG_ON(offset >= PAGE_SIZE);
1471        if (PageHighMem(page))
1472                /*
1473                 * This catches illegal uses and preserves the offset:
1474                 */
1475                bh->b_data = (char *)(0 + offset);
1476        else
1477                bh->b_data = page_address(page) + offset;
1478}
1479EXPORT_SYMBOL(set_bh_page);
1480
1481/*
1482 * Called when truncating a buffer on a page completely.
1483 */
1484
1485/* Bits that are cleared during an invalidate */
1486#define BUFFER_FLAGS_DISCARD \
1487        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1488         1 << BH_Delay | 1 << BH_Unwritten)
1489
1490static void discard_buffer(struct buffer_head * bh)
1491{
1492        unsigned long b_state, b_state_old;
1493
1494        lock_buffer(bh);
1495        clear_buffer_dirty(bh);
1496        bh->b_bdev = NULL;
1497        b_state = bh->b_state;
1498        for (;;) {
1499                b_state_old = cmpxchg(&bh->b_state, b_state,
1500                                      (b_state & ~BUFFER_FLAGS_DISCARD));
1501                if (b_state_old == b_state)
1502                        break;
1503                b_state = b_state_old;
1504        }
1505        unlock_buffer(bh);
1506}
1507
1508/**
1509 * block_invalidatepage - invalidate part or all of a buffer-backed page
1510 *
1511 * @page: the page which is affected
1512 * @offset: start of the range to invalidate
1513 * @length: length of the range to invalidate
1514 *
1515 * block_invalidatepage() is called when all or part of the page has become
1516 * invalidated by a truncate operation.
1517 *
1518 * block_invalidatepage() does not have to release all buffers, but it must
1519 * ensure that no dirty buffer is left outside @offset and that no I/O
1520 * is underway against any of the blocks which are outside the truncation
1521 * point.  Because the caller is about to free (and possibly reuse) those
1522 * blocks on-disk.
1523 */
1524void block_invalidatepage(struct page *page, unsigned int offset,
1525                          unsigned int length)
1526{
1527        struct buffer_head *head, *bh, *next;
1528        unsigned int curr_off = 0;
1529        unsigned int stop = length + offset;
1530
1531        BUG_ON(!PageLocked(page));
1532        if (!page_has_buffers(page))
1533                goto out;
1534
1535        /*
1536         * Check for overflow
1537         */
1538        BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1539
1540        head = page_buffers(page);
1541        bh = head;
1542        do {
1543                unsigned int next_off = curr_off + bh->b_size;
1544                next = bh->b_this_page;
1545
1546                /*
1547                 * Are we still fully in range ?
1548                 */
1549                if (next_off > stop)
1550                        goto out;
1551
1552                /*
1553                 * is this block fully invalidated?
1554                 */
1555                if (offset <= curr_off)
1556                        discard_buffer(bh);
1557                curr_off = next_off;
1558                bh = next;
1559        } while (bh != head);
1560
1561        /*
1562         * We release buffers only if the entire page is being invalidated.
1563         * The get_block cached value has been unconditionally invalidated,
1564         * so real IO is not possible anymore.
1565         */
1566        if (offset == 0)
1567                try_to_release_page(page, 0);
1568out:
1569        return;
1570}
1571EXPORT_SYMBOL(block_invalidatepage);
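
    /*
     * Sketch of typical usage (hypothetical "myfs"): a buffer-head based
     * filesystem can point its address_space_operations at this helper
     * directly; when ->invalidatepage is left NULL, do_invalidatepage()
     * falls back to block_invalidatepage() for pages that have buffers.
     *
     *        static const struct address_space_operations myfs_aops = {
     *                ...
     *                .invalidatepage = block_invalidatepage,
     *        };
     */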
1572
1573
1574/*
1575 * We attach and possibly dirty the buffers atomically wrt
1576 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1577 * is already excluded via the page lock.
1578 */
1579void create_empty_buffers(struct page *page,
1580                        unsigned long blocksize, unsigned long b_state)
1581{
1582        struct buffer_head *bh, *head, *tail;
1583
1584        head = alloc_page_buffers(page, blocksize, 1);
1585        bh = head;
1586        do {
1587                bh->b_state |= b_state;
1588                tail = bh;
1589                bh = bh->b_this_page;
1590        } while (bh);
1591        tail->b_this_page = head;
1592
1593        spin_lock(&page->mapping->private_lock);
1594        if (PageUptodate(page) || PageDirty(page)) {
1595                bh = head;
1596                do {
1597                        if (PageDirty(page))
1598                                set_buffer_dirty(bh);
1599                        if (PageUptodate(page))
1600                                set_buffer_uptodate(bh);
1601                        bh = bh->b_this_page;
1602                } while (bh != head);
1603        }
1604        attach_page_buffers(page, head);
1605        spin_unlock(&page->mapping->private_lock);
1606}
1607EXPORT_SYMBOL(create_empty_buffers);
1608
1609/*
1610 * We are taking a block for data and we don't want any output from any
1611 * buffer-cache aliases starting from the moment this function returns
1612 * until something explicitly marks the buffer dirty (hopefully that
1613 * will not happen until we free that block ;-)
1614 * We don't even need to mark it not-uptodate - nobody can expect
1615 * anything from a newly allocated buffer anyway. We used to use
1616 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1617 * don't want to mark the alias unmapped, for example - it would confuse
1618 * anyone who might pick it up with bread() afterwards...
1619 *
1620 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1621 * be writeout I/O going on against recently-freed buffers.  We don't
1622 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1623 * only if we really need to.  That happens here.
1624 */
1625void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1626{
1627        struct buffer_head *old_bh;
1628
1629        might_sleep();
1630
1631        old_bh = __find_get_block_slow(bdev, block);
1632        if (old_bh) {
1633                clear_buffer_dirty(old_bh);
1634                wait_on_buffer(old_bh);
1635                clear_buffer_req(old_bh);
1636                __brelse(old_bh);
1637        }
1638}
1639EXPORT_SYMBOL(unmap_underlying_metadata);
1640
1641/*
1642 * Size is a power-of-two in the range 512..PAGE_SIZE,
1643 * and the case we care about most is PAGE_SIZE.
1644 *
1645 * So this *could* possibly be written with those
1646 * constraints in mind (relevant mostly if some
1647 * architecture has a slow bit-scan instruction)
1648 */
1649static inline int block_size_bits(unsigned int blocksize)
1650{
1651        return ilog2(blocksize);
1652}
1653
1654static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1655{
1656        BUG_ON(!PageLocked(page));
1657
1658        if (!page_has_buffers(page))
1659                create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1660        return page_buffers(page);
1661}
1662
1663/*
1664 * NOTE! All mapped/uptodate combinations are valid:
1665 *
1666 *      Mapped  Uptodate        Meaning
1667 *
1668 *      No      No              "unknown" - must do get_block()
1669 *      No      Yes             "hole" - zero-filled
1670 *      Yes     No              "allocated" - allocated on disk, not read in
1671 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1672 *
1673 * "Dirty" is valid only with the last case (mapped+uptodate).
1674 */
1675
1676/*
1677 * While block_write_full_page is writing back the dirty buffers under
1678 * the page lock, whoever dirtied the buffers may decide to clean them
1679 * again at any time.  We handle that by only looking at the buffer
1680 * state inside lock_buffer().
1681 *
1682 * If block_write_full_page() is called for regular writeback
1683 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1684 * locked buffer.  This can only happen if someone has written the buffer
1685 * directly, with submit_bh().  At the address_space level PageWriteback
1686 * prevents this contention from occurring.
1687 *
1688 * If block_write_full_page() is called with wbc->sync_mode ==
1689 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1690 * causes the writes to be flagged as synchronous writes.
1691 */
1692static int __block_write_full_page(struct inode *inode, struct page *page,
1693                        get_block_t *get_block, struct writeback_control *wbc,
1694                        bh_end_io_t *handler)
1695{
1696        int err;
1697        sector_t block;
1698        sector_t last_block;
1699        struct buffer_head *bh, *head;
1700        unsigned int blocksize, bbits;
1701        int nr_underway = 0;
1702        int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1703
1704        head = create_page_buffers(page, inode,
1705                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1706
1707        /*
1708         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1709         * here, and the (potentially unmapped) buffers may become dirty at
1710         * any time.  If a buffer becomes dirty here after we've inspected it
1711         * then we just miss that fact, and the page stays dirty.
1712         *
1713         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1714         * handle that here by just cleaning them.
1715         */
1716
1717        bh = head;
1718        blocksize = bh->b_size;
1719        bbits = block_size_bits(blocksize);
1720
1721        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1722        last_block = (i_size_read(inode) - 1) >> bbits;
1723
1724        /*
1725         * Get all the dirty buffers mapped to disk addresses and
1726         * handle any aliases from the underlying blockdev's mapping.
1727         */
1728        do {
1729                if (block > last_block) {
1730                        /*
1731                         * mapped buffers outside i_size will occur, because
1732                         * this page can be outside i_size when there is a
1733                         * truncate in progress.
1734                         */
1735                        /*
1736                         * The buffer was zeroed by block_write_full_page()
1737                         */
1738                        clear_buffer_dirty(bh);
1739                        set_buffer_uptodate(bh);
1740                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1741                           buffer_dirty(bh)) {
1742                        WARN_ON(bh->b_size != blocksize);
1743                        err = get_block(inode, block, bh, 1);
1744                        if (err)
1745                                goto recover;
1746                        clear_buffer_delay(bh);
1747                        if (buffer_new(bh)) {
1748                                /* blockdev mappings never come here */
1749                                clear_buffer_new(bh);
1750                                unmap_underlying_metadata(bh->b_bdev,
1751                                                        bh->b_blocknr);
1752                        }
1753                }
1754                bh = bh->b_this_page;
1755                block++;
1756        } while (bh != head);
1757
1758        do {
1759                if (!buffer_mapped(bh))
1760                        continue;
1761                /*
1762                 * If it's a fully non-blocking write attempt and we cannot
1763                 * lock the buffer then redirty the page.  Note that this can
1764                 * potentially cause a busy-wait loop from writeback threads
1765                 * and kswapd activity, but those code paths have their own
1766                 * higher-level throttling.
1767                 */
1768                if (wbc->sync_mode != WB_SYNC_NONE) {
1769                        lock_buffer(bh);
1770                } else if (!trylock_buffer(bh)) {
1771                        redirty_page_for_writepage(wbc, page);
1772                        continue;
1773                }
1774                if (test_clear_buffer_dirty(bh)) {
1775                        mark_buffer_async_write_endio(bh, handler);
1776                } else {
1777                        unlock_buffer(bh);
1778                }
1779        } while ((bh = bh->b_this_page) != head);
1780
1781        /*
1782         * The page and its buffers are protected by PageWriteback(), so we can
1783         * drop the bh refcounts early.
1784         */
1785        BUG_ON(PageWriteback(page));
1786        set_page_writeback(page);
1787
1788        do {
1789                struct buffer_head *next = bh->b_this_page;
1790                if (buffer_async_write(bh)) {
1791                        submit_bh_wbc(write_op, bh, 0, wbc);
1792                        nr_underway++;
1793                }
1794                bh = next;
1795        } while (bh != head);
1796        unlock_page(page);
1797
1798        err = 0;
1799done:
1800        if (nr_underway == 0) {
1801                /*
1802                 * The page was marked dirty, but the buffers were
1803                 * clean.  Someone wrote them back by hand with
1804                 * ll_rw_block/submit_bh.  A rare case.
1805                 */
1806                end_page_writeback(page);
1807
1808                /*
1809                 * The page and buffer_heads can be released at any time from
1810                 * here on.
1811                 */
1812        }
1813        return err;
1814
1815recover:
1816        /*
1817         * ENOSPC, or some other error.  We may already have added some
1818         * blocks to the file, so we need to write these out to avoid
1819         * exposing stale data.
1820         * The page is currently locked and not marked for writeback
1821         */
1822        bh = head;
1823        /* Recovery: lock and submit the mapped buffers */
1824        do {
1825                if (buffer_mapped(bh) && buffer_dirty(bh) &&
1826                    !buffer_delay(bh)) {
1827                        lock_buffer(bh);
1828                        mark_buffer_async_write_endio(bh, handler);
1829                } else {
1830                        /*
1831                         * The buffer may have been set dirty during
1832                         * attachment to a dirty page.
1833                         */
1834                        clear_buffer_dirty(bh);
1835                }
1836        } while ((bh = bh->b_this_page) != head);
1837        SetPageError(page);
1838        BUG_ON(PageWriteback(page));
1839        mapping_set_error(page->mapping, err);
1840        set_page_writeback(page);
1841        do {
1842                struct buffer_head *next = bh->b_this_page;
1843                if (buffer_async_write(bh)) {
1844                        clear_buffer_dirty(bh);
1845                        submit_bh_wbc(write_op, bh, 0, wbc);
1846                        nr_underway++;
1847                }
1848                bh = next;
1849        } while (bh != head);
1850        unlock_page(page);
1851        goto done;
1852}
1853
1854/*
1855 * If a page has any new buffers, zero them out here, and mark them uptodate
1856 * and dirty so they'll be written out (in order to prevent uninitialised
1857 * block data from leaking), and clear the new bit.
1858 */
1859void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1860{
1861        unsigned int block_start, block_end;
1862        struct buffer_head *head, *bh;
1863
1864        BUG_ON(!PageLocked(page));
1865        if (!page_has_buffers(page))
1866                return;
1867
1868        bh = head = page_buffers(page);
1869        block_start = 0;
1870        do {
1871                block_end = block_start + bh->b_size;
1872
1873                if (buffer_new(bh)) {
1874                        if (block_end > from && block_start < to) {
1875                                if (!PageUptodate(page)) {
1876                                        unsigned start, size;
1877
1878                                        start = max(from, block_start);
1879                                        size = min(to, block_end) - start;
1880
1881                                        zero_user(page, start, size);
1882                                        set_buffer_uptodate(bh);
1883                                }
1884
1885                                clear_buffer_new(bh);
1886                                mark_buffer_dirty(bh);
1887                        }
1888                }
1889
1890                block_start = block_end;
1891                bh = bh->b_this_page;
1892        } while (bh != head);
1893}
1894EXPORT_SYMBOL(page_zero_new_buffers);
1895
1896int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1897                get_block_t *get_block)
1898{
1899        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1900        unsigned to = from + len;
1901        struct inode *inode = page->mapping->host;
1902        unsigned block_start, block_end;
1903        sector_t block;
1904        int err = 0;
1905        unsigned blocksize, bbits;
1906        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1907
1908        BUG_ON(!PageLocked(page));
1909        BUG_ON(from > PAGE_CACHE_SIZE);
1910        BUG_ON(to > PAGE_CACHE_SIZE);
1911        BUG_ON(from > to);
1912
1913        head = create_page_buffers(page, inode, 0);
1914        blocksize = head->b_size;
1915        bbits = block_size_bits(blocksize);
1916
1917        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1918
1919        for(bh = head, block_start = 0; bh != head || !block_start;
1920            block++, block_start=block_end, bh = bh->b_this_page) {
1921                block_end = block_start + blocksize;
1922                if (block_end <= from || block_start >= to) {
1923                        if (PageUptodate(page)) {
1924                                if (!buffer_uptodate(bh))
1925                                        set_buffer_uptodate(bh);
1926                        }
1927                        continue;
1928                }
1929                if (buffer_new(bh))
1930                        clear_buffer_new(bh);
1931                if (!buffer_mapped(bh)) {
1932                        WARN_ON(bh->b_size != blocksize);
1933                        err = get_block(inode, block, bh, 1);
1934                        if (err)
1935                                break;
1936                        if (buffer_new(bh)) {
1937                                unmap_underlying_metadata(bh->b_bdev,
1938                                                        bh->b_blocknr);
1939                                if (PageUptodate(page)) {
1940                                        clear_buffer_new(bh);
1941                                        set_buffer_uptodate(bh);
1942                                        mark_buffer_dirty(bh);
1943                                        continue;
1944                                }
1945                                if (block_end > to || block_start < from)
1946                                        zero_user_segments(page,
1947                                                to, block_end,
1948                                                block_start, from);
1949                                continue;
1950                        }
1951                }
1952                if (PageUptodate(page)) {
1953                        if (!buffer_uptodate(bh))
1954                                set_buffer_uptodate(bh);
1955                        continue; 
1956                }
1957                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1958                    !buffer_unwritten(bh) &&
1959                     (block_start < from || block_end > to)) {
1960                        ll_rw_block(READ, 1, &bh);
1961                        *wait_bh++=bh;
1962                }
1963        }
1964        /*
1965         * If we issued read requests - let them complete.
1966         */
1967        while(wait_bh > wait) {
1968                wait_on_buffer(*--wait_bh);
1969                if (!buffer_uptodate(*wait_bh))
1970                        err = -EIO;
1971        }
1972        if (unlikely(err))
1973                page_zero_new_buffers(page, from, to);
1974        return err;
1975}
1976EXPORT_SYMBOL(__block_write_begin);
1977
1978static int __block_commit_write(struct inode *inode, struct page *page,
1979                unsigned from, unsigned to)
1980{
1981        unsigned block_start, block_end;
1982        int partial = 0;
1983        unsigned blocksize;
1984        struct buffer_head *bh, *head;
1985
1986        bh = head = page_buffers(page);
1987        blocksize = bh->b_size;
1988
1989        block_start = 0;
1990        do {
1991                block_end = block_start + blocksize;
1992                if (block_end <= from || block_start >= to) {
1993                        if (!buffer_uptodate(bh))
1994                                partial = 1;
1995                } else {
1996                        set_buffer_uptodate(bh);
1997                        mark_buffer_dirty(bh);
1998                }
1999                clear_buffer_new(bh);
2000
2001                block_start = block_end;
2002                bh = bh->b_this_page;
2003        } while (bh != head);
2004
2005        /*
2006         * If this is a partial write which happened to make all buffers
2007         * uptodate then we can optimize away a bogus readpage() for
2008         * the next read(). Here we 'discover' whether the page went
2009         * uptodate as a result of this (potentially partial) write.
2010         */
2011        if (!partial)
2012                SetPageUptodate(page);
2013        return 0;
2014}
2015
2016/*
2017 * block_write_begin takes care of the basic task of block allocation and
2018 * bringing partial write blocks uptodate first.
2019 *
2020 * The filesystem needs to handle block truncation upon failure.
2021 */
2022int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2023                unsigned flags, struct page **pagep, get_block_t *get_block)
2024{
2025        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2026        struct page *page;
2027        int status;
2028
2029        page = grab_cache_page_write_begin(mapping, index, flags);
2030        if (!page)
2031                return -ENOMEM;
2032
2033        status = __block_write_begin(page, pos, len, get_block);
2034        if (unlikely(status)) {
2035                unlock_page(page);
2036                page_cache_release(page);
2037                page = NULL;
2038        }
2039
2040        *pagep = page;
2041        return status;
2042}
2043EXPORT_SYMBOL(block_write_begin);
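
    /*
     * Sketch of typical usage (myfs_get_block and myfs_write_failed are
     * hypothetical): a simple filesystem's ->write_begin is usually a thin
     * wrapper around block_write_begin(), plus the block truncation on
     * failure that the comment above requires:
     *
     *        static int myfs_write_begin(struct file *file,
     *                        struct address_space *mapping, loff_t pos,
     *                        unsigned len, unsigned flags,
     *                        struct page **pagep, void **fsdata)
     *        {
     *                int ret;
     *
     *                ret = block_write_begin(mapping, pos, len, flags, pagep,
     *                                        myfs_get_block);
     *                if (unlikely(ret))
     *                        myfs_write_failed(mapping, pos + len);
     *                return ret;
     *        }
     */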
2044
2045int block_write_end(struct file *file, struct address_space *mapping,
2046                        loff_t pos, unsigned len, unsigned copied,
2047                        struct page *page, void *fsdata)
2048{
2049        struct inode *inode = mapping->host;
2050        unsigned start;
2051
2052        start = pos & (PAGE_CACHE_SIZE - 1);
2053
2054        if (unlikely(copied < len)) {
2055                /*
2056                 * The buffers that were written will now be uptodate, so we
2057                 * don't have to worry about a readpage reading them and
2058                 * overwriting a partial write. However if we have encountered
2059                 * a short write and only partially written into a buffer, it
2060                 * will not be marked uptodate, so a readpage might come in and
2061                 * destroy our partial write.
2062                 *
2063                 * Do the simplest thing, and just treat any short write to a
2064                 * non uptodate page as a zero-length write, and force the
2065                 * caller to redo the whole thing.
2066                 */
2067                if (!PageUptodate(page))
2068                        copied = 0;
2069
2070                page_zero_new_buffers(page, start+copied, start+len);
2071        }
2072        flush_dcache_page(page);
2073
2074        /* This could be a short (even 0-length) commit */
2075        __block_commit_write(inode, page, start, start+copied);
2076
2077        return copied;
2078}
2079EXPORT_SYMBOL(block_write_end);
2080
2081int generic_write_end(struct file *file, struct address_space *mapping,
2082                        loff_t pos, unsigned len, unsigned copied,
2083                        struct page *page, void *fsdata)
2084{
2085        struct inode *inode = mapping->host;
2086        loff_t old_size = inode->i_size;
2087        int i_size_changed = 0;
2088
2089        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2090
2091        /*
2092         * No need to use i_size_read() here, the i_size
2093         * cannot change under us because we hold i_mutex.
2094         *
2095         * But it's important to update i_size while still holding page lock:
2096         * page writeout could otherwise come in and zero beyond i_size.
2097         */
2098        if (pos+copied > inode->i_size) {
2099                i_size_write(inode, pos+copied);
2100                i_size_changed = 1;
2101        }
2102
2103        unlock_page(page);
2104        page_cache_release(page);
2105
2106        if (old_size < pos)
2107                pagecache_isize_extended(inode, old_size, pos);
2108        /*
2109         * Don't mark the inode dirty under page lock. First, it unnecessarily
2110         * makes the holding time of page lock longer. Second, it forces lock
2111         * ordering of page lock and transaction start for journaling
2112         * filesystems.
2113         */
2114        if (i_size_changed)
2115                mark_inode_dirty(inode);
2116
2117        return copied;
2118}
2119EXPORT_SYMBOL(generic_write_end);
2120
2121/*
2122 * block_is_partially_uptodate checks whether buffers within a page are
2123 * uptodate or not.
2124 *
2125 * Returns true if all buffers which correspond to a file portion
2126 * we want to read are uptodate.
2127 */
2128int block_is_partially_uptodate(struct page *page, unsigned long from,
2129                                        unsigned long count)
2130{
2131        unsigned block_start, block_end, blocksize;
2132        unsigned to;
2133        struct buffer_head *bh, *head;
2134        int ret = 1;
2135
2136        if (!page_has_buffers(page))
2137                return 0;
2138
2139        head = page_buffers(page);
2140        blocksize = head->b_size;
2141        to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
2142        to = from + to;
2143        if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2144                return 0;
2145
2146        bh = head;
2147        block_start = 0;
2148        do {
2149                block_end = block_start + blocksize;
2150                if (block_end > from && block_start < to) {
2151                        if (!buffer_uptodate(bh)) {
2152                                ret = 0;
2153                                break;
2154                        }
2155                        if (block_end >= to)
2156                                break;
2157                }
2158                block_start = block_end;
2159                bh = bh->b_this_page;
2160        } while (bh != head);
2161
2162        return ret;
2163}
2164EXPORT_SYMBOL(block_is_partially_uptodate);
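
    /*
     * Sketch (hypothetical "myfs"): how the exported helpers in this file
     * typically slot into a simple filesystem's address_space_operations.
     * myfs_write_begin is sketched after block_write_begin() above;
     * myfs_readpage, myfs_writepage and myfs_bmap are thin wrappers
     * sketched next to their helpers below.
     *
     *        static const struct address_space_operations myfs_aops = {
     *                .readpage               = myfs_readpage,
     *                .writepage              = myfs_writepage,
     *                .write_begin            = myfs_write_begin,
     *                .write_end              = generic_write_end,
     *                .bmap                   = myfs_bmap,
     *                .is_partially_uptodate  = block_is_partially_uptodate,
     *        };
     */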
2165
2166/*
2167 * Generic "read page" function for block devices that have the normal
2168 * get_block functionality, which covers most block device filesystems.
2169 * Reads the page asynchronously --- the unlock_buffer() and
2170 * set/clear_buffer_uptodate() functions propagate buffer state into the
2171 * page struct once IO has completed.
2172 */
2173int block_read_full_page(struct page *page, get_block_t *get_block)
2174{
2175        struct inode *inode = page->mapping->host;
2176        sector_t iblock, lblock;
2177        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2178        unsigned int blocksize, bbits;
2179        int nr, i;
2180        int fully_mapped = 1;
2181
2182        head = create_page_buffers(page, inode, 0);
2183        blocksize = head->b_size;
2184        bbits = block_size_bits(blocksize);
2185
2186        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2187        lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2188        bh = head;
2189        nr = 0;
2190        i = 0;
2191
2192        do {
2193                if (buffer_uptodate(bh))
2194                        continue;
2195
2196                if (!buffer_mapped(bh)) {
2197                        int err = 0;
2198
2199                        fully_mapped = 0;
2200                        if (iblock < lblock) {
2201                                WARN_ON(bh->b_size != blocksize);
2202                                err = get_block(inode, iblock, bh, 0);
2203                                if (err)
2204                                        SetPageError(page);
2205                        }
2206                        if (!buffer_mapped(bh)) {
2207                                zero_user(page, i * blocksize, blocksize);
2208                                if (!err)
2209                                        set_buffer_uptodate(bh);
2210                                continue;
2211                        }
2212                        /*
2213                         * get_block() might have updated the buffer
2214                         * synchronously
2215                         */
2216                        if (buffer_uptodate(bh))
2217                                continue;
2218                }
2219                arr[nr++] = bh;
2220        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2221
2222        if (fully_mapped)
2223                SetPageMappedToDisk(page);
2224
2225        if (!nr) {
2226                /*
2227                 * All buffers are uptodate - we can set the page uptodate
2228                 * as well. But not if get_block() returned an error.
2229                 */
2230                if (!PageError(page))
2231                        SetPageUptodate(page);
2232                unlock_page(page);
2233                return 0;
2234        }
2235
2236        /* Stage two: lock the buffers */
2237        for (i = 0; i < nr; i++) {
2238                bh = arr[i];
2239                lock_buffer(bh);
2240                mark_buffer_async_read(bh);
2241        }
2242
2243        /*
2244         * Stage 3: start the IO.  Check for uptodateness
2245         * inside the buffer lock in case another process reading
2246         * the underlying blockdev brought it uptodate (the sct fix).
2247         */
2248        for (i = 0; i < nr; i++) {
2249                bh = arr[i];
2250                if (buffer_uptodate(bh))
2251                        end_buffer_async_read(bh, 1);
2252                else
2253                        submit_bh(READ, bh);
2254        }
2255        return 0;
2256}
2257EXPORT_SYMBOL(block_read_full_page);
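
    /*
     * Sketch of typical usage (myfs_get_block is hypothetical): a
     * filesystem's ->readpage is commonly just a wrapper that supplies its
     * own get_block routine:
     *
     *        static int myfs_readpage(struct file *file, struct page *page)
     *        {
     *                return block_read_full_page(page, myfs_get_block);
     *        }
     */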
2258
2259/* Utility function for filesystems that need to do work on expanding
2260 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2261 * deal with the hole.
2262 */
2263int generic_cont_expand_simple(struct inode *inode, loff_t size)
2264{
2265        struct address_space *mapping = inode->i_mapping;
2266        struct page *page;
2267        void *fsdata;
2268        int err;
2269
2270        err = inode_newsize_ok(inode, size);
2271        if (err)
2272                goto out;
2273
2274        err = pagecache_write_begin(NULL, mapping, size, 0,
2275                                AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2276                                &page, &fsdata);
2277        if (err)
2278                goto out;
2279
2280        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2281        BUG_ON(err > 0);
2282
2283out:
2284        return err;
2285}
2286EXPORT_SYMBOL(generic_cont_expand_simple);
2287
2288static int cont_expand_zero(struct file *file, struct address_space *mapping,
2289                            loff_t pos, loff_t *bytes)
2290{
2291        struct inode *inode = mapping->host;
2292        unsigned blocksize = 1 << inode->i_blkbits;
2293        struct page *page;
2294        void *fsdata;
2295        pgoff_t index, curidx;
2296        loff_t curpos;
2297        unsigned zerofrom, offset, len;
2298        int err = 0;
2299
2300        index = pos >> PAGE_CACHE_SHIFT;
2301        offset = pos & ~PAGE_CACHE_MASK;
2302
2303        while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2304                zerofrom = curpos & ~PAGE_CACHE_MASK;
2305                if (zerofrom & (blocksize-1)) {
2306                        *bytes |= (blocksize-1);
2307                        (*bytes)++;
2308                }
2309                len = PAGE_CACHE_SIZE - zerofrom;
2310
2311                err = pagecache_write_begin(file, mapping, curpos, len,
2312                                                AOP_FLAG_UNINTERRUPTIBLE,
2313                                                &page, &fsdata);
2314                if (err)
2315                        goto out;
2316                zero_user(page, zerofrom, len);
2317                err = pagecache_write_end(file, mapping, curpos, len, len,
2318                                                page, fsdata);
2319                if (err < 0)
2320                        goto out;
2321                BUG_ON(err != len);
2322                err = 0;
2323
2324                balance_dirty_pages_ratelimited(mapping);
2325
2326                if (unlikely(fatal_signal_pending(current))) {
2327                        err = -EINTR;
2328                        goto out;
2329                }
2330        }
2331
2332        /* page covers the boundary, find the boundary offset */
2333        if (index == curidx) {
2334                zerofrom = curpos & ~PAGE_CACHE_MASK;
2335                /* if we will expand the thing last block will be filled */
2336                if (offset <= zerofrom) {
2337                        goto out;
2338                }
2339                if (zerofrom & (blocksize-1)) {
2340                        *bytes |= (blocksize-1);
2341                        (*bytes)++;
2342                }
2343                len = offset - zerofrom;
2344
2345                err = pagecache_write_begin(file, mapping, curpos, len,
2346                                                AOP_FLAG_UNINTERRUPTIBLE,
2347                                                &page, &fsdata);
2348                if (err)
2349                        goto out;
2350                zero_user(page, zerofrom, len);
2351                err = pagecache_write_end(file, mapping, curpos, len, len,
2352                                                page, fsdata);
2353                if (err < 0)
2354                        goto out;
2355                BUG_ON(err != len);
2356                err = 0;
2357        }
2358out:
2359        return err;
2360}
2361
2362/*
2363 * For moronic filesystems that do not allow holes in files.
2364 * We may have to extend the file.
2365 */
2366int cont_write_begin(struct file *file, struct address_space *mapping,
2367                        loff_t pos, unsigned len, unsigned flags,
2368                        struct page **pagep, void **fsdata,
2369                        get_block_t *get_block, loff_t *bytes)
2370{
2371        struct inode *inode = mapping->host;
2372        unsigned blocksize = 1 << inode->i_blkbits;
2373        unsigned zerofrom;
2374        int err;
2375
2376        err = cont_expand_zero(file, mapping, pos, bytes);
2377        if (err)
2378                return err;
2379
2380        zerofrom = *bytes & ~PAGE_CACHE_MASK;
2381        if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2382                *bytes |= (blocksize-1);
2383                (*bytes)++;
2384        }
2385
2386        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2387}
2388EXPORT_SYMBOL(cont_write_begin);
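
    /*
     * Sketch of typical usage (the "noholefs" names are hypothetical): a
     * filesystem that cannot represent holes passes cont_write_begin() a
     * pointer to its "initialised up to here" marker, usually kept in the
     * in-core inode, so the range between that mark and the write position
     * gets zeroed first:
     *
     *        static int noholefs_write_begin(struct file *file,
     *                        struct address_space *mapping, loff_t pos,
     *                        unsigned len, unsigned flags,
     *                        struct page **pagep, void **fsdata)
     *        {
     *                return cont_write_begin(file, mapping, pos, len, flags,
     *                                        pagep, fsdata, noholefs_get_block,
     *                                        &noholefs_i(mapping->host)->i_disk_size);
     *        }
     */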
2389
2390int block_commit_write(struct page *page, unsigned from, unsigned to)
2391{
2392        struct inode *inode = page->mapping->host;
2393        __block_commit_write(inode,page,from,to);
2394        return 0;
2395}
2396EXPORT_SYMBOL(block_commit_write);
2397
2398/*
2399 * block_page_mkwrite() is not allowed to change the file size as it gets
2400 * called from a page fault handler when a page is first dirtied. Hence we must
2401 * be careful to check for EOF conditions here. We set the page up correctly
2402 * for a written page which means we get ENOSPC checking when writing into
2403 * holes and correct delalloc and unwritten extent mapping on filesystems that
2404 * support these features.
2405 *
2406 * We are not allowed to take the i_mutex here so we have to play games to
2407 * protect against truncate races as the page could now be beyond EOF.  Because
2408 * truncate writes the inode size before removing pages, once we have the
2409 * page lock we can determine safely if the page is beyond EOF. If it is not
2410 * beyond EOF, then the page is guaranteed safe against truncation until we
2411 * unlock the page.
2412 *
2413 * Direct callers of this function should protect against filesystem freezing
2414 * using sb_start_pagefault() - sb_end_pagefault() functions.
2415 */
2416int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2417                         get_block_t get_block)
2418{
2419        struct page *page = vmf->page;
2420        struct inode *inode = file_inode(vma->vm_file);
2421        unsigned long end;
2422        loff_t size;
2423        int ret;
2424
2425        lock_page(page);
2426        size = i_size_read(inode);
2427        if ((page->mapping != inode->i_mapping) ||
2428            (page_offset(page) > size)) {
2429                /* We overload EFAULT to mean page got truncated */
2430                ret = -EFAULT;
2431                goto out_unlock;
2432        }
2433
2434        /* page is wholly or partially inside EOF */
2435        if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2436                end = size & ~PAGE_CACHE_MASK;
2437        else
2438                end = PAGE_CACHE_SIZE;
2439
2440        ret = __block_write_begin(page, 0, end, get_block);
2441        if (!ret)
2442                ret = block_commit_write(page, 0, end);
2443
2444        if (unlikely(ret < 0))
2445                goto out_unlock;
2446        set_page_dirty(page);
2447        wait_for_stable_page(page);
2448        return 0;
2449out_unlock:
2450        unlock_page(page);
2451        return ret;
2452}
2453EXPORT_SYMBOL(block_page_mkwrite);
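
    /*
     * Sketch of the freeze protection requested by the comment above
     * (myfs_get_block is hypothetical); block_page_mkwrite_return() from
     * buffer_head.h converts the 0/-errno result into VM_FAULT_* codes:
     *
     *        static int myfs_page_mkwrite(struct vm_area_struct *vma,
     *                                     struct vm_fault *vmf)
     *        {
     *                struct super_block *sb = file_inode(vma->vm_file)->i_sb;
     *                int err;
     *
     *                sb_start_pagefault(sb);
     *                err = block_page_mkwrite(vma, vmf, myfs_get_block);
     *                sb_end_pagefault(sb);
     *                return block_page_mkwrite_return(err);
     *        }
     */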
2454
2455/*
2456 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2457 * immediately, while under the page lock.  So it needs a special end_io
2458 * handler which does not touch the bh after unlocking it.
2459 */
2460static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2461{
2462        __end_buffer_read_notouch(bh, uptodate);
2463}
2464
2465/*
2466 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2467 * the page (converting it to circular linked list and taking care of page
2468 * dirty races).
2469 */
2470static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2471{
2472        struct buffer_head *bh;
2473
2474        BUG_ON(!PageLocked(page));
2475
2476        spin_lock(&page->mapping->private_lock);
2477        bh = head;
2478        do {
2479                if (PageDirty(page))
2480                        set_buffer_dirty(bh);
2481                if (!bh->b_this_page)
2482                        bh->b_this_page = head;
2483                bh = bh->b_this_page;
2484        } while (bh != head);
2485        attach_page_buffers(page, head);
2486        spin_unlock(&page->mapping->private_lock);
2487}
2488
2489/*
2490 * On entry, the page is fully not uptodate.
2491 * On exit, the page is fully uptodate in the areas outside (from,to).
2492 * The filesystem needs to handle block truncation upon failure.
2493 */
2494int nobh_write_begin(struct address_space *mapping,
2495                        loff_t pos, unsigned len, unsigned flags,
2496                        struct page **pagep, void **fsdata,
2497                        get_block_t *get_block)
2498{
2499        struct inode *inode = mapping->host;
2500        const unsigned blkbits = inode->i_blkbits;
2501        const unsigned blocksize = 1 << blkbits;
2502        struct buffer_head *head, *bh;
2503        struct page *page;
2504        pgoff_t index;
2505        unsigned from, to;
2506        unsigned block_in_page;
2507        unsigned block_start, block_end;
2508        sector_t block_in_file;
2509        int nr_reads = 0;
2510        int ret = 0;
2511        int is_mapped_to_disk = 1;
2512
2513        index = pos >> PAGE_CACHE_SHIFT;
2514        from = pos & (PAGE_CACHE_SIZE - 1);
2515        to = from + len;
2516
2517        page = grab_cache_page_write_begin(mapping, index, flags);
2518        if (!page)
2519                return -ENOMEM;
2520        *pagep = page;
2521        *fsdata = NULL;
2522
2523        if (page_has_buffers(page)) {
2524                ret = __block_write_begin(page, pos, len, get_block);
2525                if (unlikely(ret))
2526                        goto out_release;
2527                return ret;
2528        }
2529
2530        if (PageMappedToDisk(page))
2531                return 0;
2532
2533        /*
2534         * Allocate buffers so that we can keep track of state, and potentially
2535         * attach them to the page if an error occurs. In the common case of
2536         * no error, they will just be freed again without ever being attached
2537         * to the page (which is all OK, because we're under the page lock).
2538         *
2539         * Be careful: the buffer linked list is a NULL terminated one, rather
2540         * than the circular one we're used to.
2541         */
2542        head = alloc_page_buffers(page, blocksize, 0);
2543        if (!head) {
2544                ret = -ENOMEM;
2545                goto out_release;
2546        }
2547
2548        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2549
2550        /*
2551         * We loop across all blocks in the page, whether or not they are
2552         * part of the affected region.  This is so we can discover if the
2553         * page is fully mapped-to-disk.
2554         */
2555        for (block_start = 0, block_in_page = 0, bh = head;
2556                  block_start < PAGE_CACHE_SIZE;
2557                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2558                int create;
2559
2560                block_end = block_start + blocksize;
2561                bh->b_state = 0;
2562                create = 1;
2563                if (block_start >= to)
2564                        create = 0;
2565                ret = get_block(inode, block_in_file + block_in_page,
2566                                        bh, create);
2567                if (ret)
2568                        goto failed;
2569                if (!buffer_mapped(bh))
2570                        is_mapped_to_disk = 0;
2571                if (buffer_new(bh))
2572                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2573                if (PageUptodate(page)) {
2574                        set_buffer_uptodate(bh);
2575                        continue;
2576                }
2577                if (buffer_new(bh) || !buffer_mapped(bh)) {
2578                        zero_user_segments(page, block_start, from,
2579                                                        to, block_end);
2580                        continue;
2581                }
2582                if (buffer_uptodate(bh))
2583                        continue;       /* reiserfs does this */
2584                if (block_start < from || block_end > to) {
2585                        lock_buffer(bh);
2586                        bh->b_end_io = end_buffer_read_nobh;
2587                        submit_bh(READ, bh);
2588                        nr_reads++;
2589                }
2590        }
2591
2592        if (nr_reads) {
2593                /*
2594                 * The page is locked, so these buffers are protected from
2595                 * any VM or truncate activity.  Hence we don't need to care
2596                 * for the buffer_head refcounts.
2597                 */
2598                for (bh = head; bh; bh = bh->b_this_page) {
2599                        wait_on_buffer(bh);
2600                        if (!buffer_uptodate(bh))
2601                                ret = -EIO;
2602                }
2603                if (ret)
2604                        goto failed;
2605        }
2606
2607        if (is_mapped_to_disk)
2608                SetPageMappedToDisk(page);
2609
2610        *fsdata = head; /* to be released by nobh_write_end */
2611
2612        return 0;
2613
2614failed:
2615        BUG_ON(!ret);
2616        /*
2617         * Error recovery is a bit difficult. We need to zero out blocks that
2618         * were newly allocated, and dirty them to ensure they get written out.
2619         * Buffers need to be attached to the page at this point, otherwise
2620         * the handling of potential IO errors during writeout would be hard
2621         * (could try doing synchronous writeout, but what if that fails too?)
2622         */
2623        attach_nobh_buffers(page, head);
2624        page_zero_new_buffers(page, from, to);
2625
2626out_release:
2627        unlock_page(page);
2628        page_cache_release(page);
2629        *pagep = NULL;
2630
2631        return ret;
2632}
2633EXPORT_SYMBOL(nobh_write_begin);
2634
2635int nobh_write_end(struct file *file, struct address_space *mapping,
2636                        loff_t pos, unsigned len, unsigned copied,
2637                        struct page *page, void *fsdata)
2638{
2639        struct inode *inode = page->mapping->host;
2640        struct buffer_head *head = fsdata;
2641        struct buffer_head *bh;
2642        BUG_ON(fsdata != NULL && page_has_buffers(page));
2643
2644        if (unlikely(copied < len) && head)
2645                attach_nobh_buffers(page, head);
2646        if (page_has_buffers(page))
2647                return generic_write_end(file, mapping, pos, len,
2648                                        copied, page, fsdata);
2649
2650        SetPageUptodate(page);
2651        set_page_dirty(page);
2652        if (pos+copied > inode->i_size) {
2653                i_size_write(inode, pos+copied);
2654                mark_inode_dirty(inode);
2655        }
2656
2657        unlock_page(page);
2658        page_cache_release(page);
2659
2660        while (head) {
2661                bh = head;
2662                head = head->b_this_page;
2663                free_buffer_head(bh);
2664        }
2665
2666        return copied;
2667}
2668EXPORT_SYMBOL(nobh_write_end);
2669
2670/*
2671 * nobh_writepage() - based on block_write_full_page() except
2672 * that it tries to operate without attaching bufferheads to
2673 * the page.
2674 */
2675int nobh_writepage(struct page *page, get_block_t *get_block,
2676                        struct writeback_control *wbc)
2677{
2678        struct inode * const inode = page->mapping->host;
2679        loff_t i_size = i_size_read(inode);
2680        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2681        unsigned offset;
2682        int ret;
2683
2684        /* Is the page fully inside i_size? */
2685        if (page->index < end_index)
2686                goto out;
2687
2688        /* Is the page fully outside i_size? (truncate in progress) */
2689        offset = i_size & (PAGE_CACHE_SIZE-1);
2690        if (page->index >= end_index+1 || !offset) {
2691                /*
2692                 * The page may have dirty, unmapped buffers.  For example,
2693                 * they may have been added in ext3_writepage().  Make them
2694                 * freeable here, so the page does not leak.
2695                 */
2696#if 0
2697                /* Not really sure about this  - do we need this ? */
2698                if (page->mapping->a_ops->invalidatepage)
2699                        page->mapping->a_ops->invalidatepage(page, offset);
2700#endif
2701                unlock_page(page);
2702                return 0; /* don't care */
2703        }
2704
2705        /*
2706         * The page straddles i_size.  It must be zeroed out on each and every
2707         * writepage invocation because it may be mmapped.  "A file is mapped
2708         * in multiples of the page size.  For a file that is not a multiple of
2709         * the  page size, the remaining memory is zeroed when mapped, and
2710         * writes to that region are not written out to the file."
2711         */
2712        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2713out:
2714        ret = mpage_writepage(page, get_block, wbc);
2715        if (ret == -EAGAIN)
2716                ret = __block_write_full_page(inode, page, get_block, wbc,
2717                                              end_buffer_async_write);
2718        return ret;
2719}
2720EXPORT_SYMBOL(nobh_writepage);
2721
2722int nobh_truncate_page(struct address_space *mapping,
2723                        loff_t from, get_block_t *get_block)
2724{
2725        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2726        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2727        unsigned blocksize;
2728        sector_t iblock;
2729        unsigned length, pos;
2730        struct inode *inode = mapping->host;
2731        struct page *page;
2732        struct buffer_head map_bh;
2733        int err;
2734
2735        blocksize = 1 << inode->i_blkbits;
2736        length = offset & (blocksize - 1);
2737
2738        /* Block boundary? Nothing to do */
2739        if (!length)
2740                return 0;
2741
2742        length = blocksize - length;
2743        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2744
2745        page = grab_cache_page(mapping, index);
2746        err = -ENOMEM;
2747        if (!page)
2748                goto out;
2749
2750        if (page_has_buffers(page)) {
2751has_buffers:
2752                unlock_page(page);
2753                page_cache_release(page);
2754                return block_truncate_page(mapping, from, get_block);
2755        }
2756
2757        /* Find the buffer that contains "offset" */
2758        pos = blocksize;
2759        while (offset >= pos) {
2760                iblock++;
2761                pos += blocksize;
2762        }
2763
2764        map_bh.b_size = blocksize;
2765        map_bh.b_state = 0;
2766        err = get_block(inode, iblock, &map_bh, 0);
2767        if (err)
2768                goto unlock;
2769        /* unmapped? It's a hole - nothing to do */
2770        if (!buffer_mapped(&map_bh))
2771                goto unlock;
2772
2773        /* Ok, it's mapped. Make sure it's up-to-date */
2774        if (!PageUptodate(page)) {
2775                err = mapping->a_ops->readpage(NULL, page);
2776                if (err) {
2777                        page_cache_release(page);
2778                        goto out;
2779                }
2780                lock_page(page);
2781                if (!PageUptodate(page)) {
2782                        err = -EIO;
2783                        goto unlock;
2784                }
2785                if (page_has_buffers(page))
2786                        goto has_buffers;
2787        }
2788        zero_user(page, offset, length);
2789        set_page_dirty(page);
2790        err = 0;
2791
2792unlock:
2793        unlock_page(page);
2794        page_cache_release(page);
2795out:
2796        return err;
2797}
2798EXPORT_SYMBOL(nobh_truncate_page);
2799
2800int block_truncate_page(struct address_space *mapping,
2801                        loff_t from, get_block_t *get_block)
2802{
2803        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2804        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2805        unsigned blocksize;
2806        sector_t iblock;
2807        unsigned length, pos;
2808        struct inode *inode = mapping->host;
2809        struct page *page;
2810        struct buffer_head *bh;
2811        int err;
2812
2813        blocksize = 1 << inode->i_blkbits;
2814        length = offset & (blocksize - 1);
2815
2816        /* Block boundary? Nothing to do */
2817        if (!length)
2818                return 0;
2819
2820        length = blocksize - length;
2821        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2822
2823        page = grab_cache_page(mapping, index);
2824        err = -ENOMEM;
2825        if (!page)
2826                goto out;
2827
2828        if (!page_has_buffers(page))
2829                create_empty_buffers(page, blocksize, 0);
2830
2831        /* Find the buffer that contains "offset" */
2832        bh = page_buffers(page);
2833        pos = blocksize;
2834        while (offset >= pos) {
2835                bh = bh->b_this_page;
2836                iblock++;
2837                pos += blocksize;
2838        }
2839
2840        err = 0;
2841        if (!buffer_mapped(bh)) {
2842                WARN_ON(bh->b_size != blocksize);
2843                err = get_block(inode, iblock, bh, 0);
2844                if (err)
2845                        goto unlock;
2846                /* unmapped? It's a hole - nothing to do */
2847                if (!buffer_mapped(bh))
2848                        goto unlock;
2849        }
2850
2851        /* Ok, it's mapped. Make sure it's up-to-date */
2852        if (PageUptodate(page))
2853                set_buffer_uptodate(bh);
2854
2855        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2856                err = -EIO;
2857                ll_rw_block(READ, 1, &bh);
2858                wait_on_buffer(bh);
2859                /* Uhhuh. Read error. Complain and punt. */
2860                if (!buffer_uptodate(bh))
2861                        goto unlock;
2862        }
2863
2864        zero_user(page, offset, length);
2865        mark_buffer_dirty(bh);
2866        err = 0;
2867
2868unlock:
2869        unlock_page(page);
2870        page_cache_release(page);
2871out:
2872        return err;
2873}
2874EXPORT_SYMBOL(block_truncate_page);
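
    /*
     * Sketch of typical usage (myfs_get_block and myfs_truncate_blocks are
     * hypothetical): a truncate/setattr path zeroes the tail of the partial
     * last block before shrinking i_size and freeing blocks:
     *
     *        err = block_truncate_page(inode->i_mapping, newsize,
     *                                  myfs_get_block);
     *        if (err)
     *                return err;
     *        truncate_setsize(inode, newsize);
     *        myfs_truncate_blocks(inode, newsize);
     */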
2875
2876/*
2877 * The generic ->writepage function for buffer-backed address_spaces
2878 */
2879int block_write_full_page(struct page *page, get_block_t *get_block,
2880                        struct writeback_control *wbc)
2881{
2882        struct inode * const inode = page->mapping->host;
2883        loff_t i_size = i_size_read(inode);
2884        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2885        unsigned offset;
2886
2887        /* Is the page fully inside i_size? */
2888        if (page->index < end_index)
2889                return __block_write_full_page(inode, page, get_block, wbc,
2890                                               end_buffer_async_write);
2891
2892        /* Is the page fully outside i_size? (truncate in progress) */
2893        offset = i_size & (PAGE_CACHE_SIZE-1);
2894        if (page->index >= end_index+1 || !offset) {
2895                /*
2896                 * The page may have dirty, unmapped buffers.  For example,
2897                 * they may have been added in ext3_writepage().  Make them
2898                 * freeable here, so the page does not leak.
2899                 */
2900                do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2901                unlock_page(page);
2902                return 0; /* don't care */
2903        }
2904
2905        /*
2906         * The page straddles i_size.  It must be zeroed out on each and every
2907         * writepage invocation because it may be mmapped.  "A file is mapped
2908         * in multiples of the page size.  For a file that is not a multiple of
2909         * the  page size, the remaining memory is zeroed when mapped, and
2910         * writes to that region are not written out to the file."
2911         */
2912        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2913        return __block_write_full_page(inode, page, get_block, wbc,
2914                                                        end_buffer_async_write);
2915}
2916EXPORT_SYMBOL(block_write_full_page);
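
    /*
     * Sketch of typical usage (myfs_get_block is hypothetical): a
     * filesystem's ->writepage usually just supplies its get_block:
     *
     *        static int myfs_writepage(struct page *page,
     *                                  struct writeback_control *wbc)
     *        {
     *                return block_write_full_page(page, myfs_get_block, wbc);
     *        }
     */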
2917
2918sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2919                            get_block_t *get_block)
2920{
2921        struct buffer_head tmp;
2922        struct inode *inode = mapping->host;
2923        tmp.b_state = 0;
2924        tmp.b_blocknr = 0;
2925        tmp.b_size = 1 << inode->i_blkbits;
2926        get_block(inode, block, &tmp, 0);
2927        return tmp.b_blocknr;
2928}
2929EXPORT_SYMBOL(generic_block_bmap);
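
    /*
     * Sketch of typical usage (myfs_get_block is hypothetical): the ->bmap
     * method that backs the FIBMAP ioctl is usually just:
     *
     *        static sector_t myfs_bmap(struct address_space *mapping,
     *                                  sector_t block)
     *        {
     *                return generic_block_bmap(mapping, block, myfs_get_block);
     *        }
     */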
2930
2931static void end_bio_bh_io_sync(struct bio *bio)
2932{
2933        struct buffer_head *bh = bio->bi_private;
2934
2935        if (unlikely(bio_flagged(bio, BIO_QUIET)))
2936                set_bit(BH_Quiet, &bh->b_state);
2937
2938        bh->b_end_io(bh, !bio->bi_error);
2939        bio_put(bio);
2940}
2941
2942/*
2943 * This allows us to do IO even on the odd last sectors
2944 * of a device, even if the block size is some multiple
2945 * of the physical sector size.
2946 *
2947 * We'll just truncate the bio to the size of the device,
2948 * and clear the end of the buffer head manually.
2949 *
2950 * Truly out-of-range accesses will turn into actual IO
2951 * errors, this only handles the "we need to be able to
2952 * do IO at the final sector" case.
2953 */
2954void guard_bio_eod(int rw, struct bio *bio)
2955{
2956        sector_t maxsector;
2957        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2958        unsigned truncated_bytes;
2959
2960        maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2961        if (!maxsector)
2962                return;
2963
2964        /*
2965         * If the *whole* IO is past the end of the device,
2966         * let it through, and the IO layer will turn it into
2967         * an EIO.
2968         */
2969        if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2970                return;
2971
2972        maxsector -= bio->bi_iter.bi_sector;
2973        if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2974                return;
2975
2976        /* Uhhuh. We've got a bio that straddles the device size! */
2977        truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2978
2979        /* Truncate the bio.. */
2980        bio->bi_iter.bi_size -= truncated_bytes;
2981        bvec->bv_len -= truncated_bytes;
2982
2983        /* ..and clear the end of the buffer for reads */
2984        if ((rw & RW_MASK) == READ) {
2985                zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2986                                truncated_bytes);
2987        }
2988}
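
/*
 * Worked example for guard_bio_eod(): assume a device of 8003 512-byte
 * sectors (maxsector == 8003) and a 4096-byte buffer_head whose block
 * starts at sector 8000.  bi_sector (8000) is below maxsector, so the bio
 * is not rejected outright; maxsector - bi_sector leaves 3 sectors of room
 * while the bio covers 8, so truncated_bytes = 4096 - (3 << 9) = 2560.  The
 * bio is clipped to 1536 bytes, and for a READ the final 2560 bytes of the
 * buffer are zero-filled by hand.
 */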
2989
2990static int submit_bh_wbc(int rw, struct buffer_head *bh,
2991                         unsigned long bio_flags, struct writeback_control *wbc)
2992{
2993        struct bio *bio;
2994
2995        BUG_ON(!buffer_locked(bh));
2996        BUG_ON(!buffer_mapped(bh));
2997        BUG_ON(!bh->b_end_io);
2998        BUG_ON(buffer_delay(bh));
2999        BUG_ON(buffer_unwritten(bh));
3000
3001        /*
3002         * Only clear out a write error when rewriting
3003         */
3004        if (test_set_buffer_req(bh) && (rw & WRITE))
3005                clear_buffer_write_io_error(bh);
3006
3007        /*
3008         * from here on down, it's all bio -- do the initial mapping,
3009         * submit_bio -> generic_make_request may further map this bio around
3010         */
3011        bio = bio_alloc(GFP_NOIO, 1);
3012
3013        if (wbc) {
3014                wbc_init_bio(wbc, bio);
3015                wbc_account_io(wbc, bh->b_page, bh->b_size);
3016        }
3017
3018        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3019        bio->bi_bdev = bh->b_bdev;
3020
3021        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3022        BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3023
3024        bio->bi_end_io = end_bio_bh_io_sync;
3025        bio->bi_private = bh;
3026        bio->bi_flags |= bio_flags;
3027
3028        /* Take care of bh's that straddle the end of the device */
3029        guard_bio_eod(rw, bio);
3030
3031        if (buffer_meta(bh))
3032                rw |= REQ_META;
3033        if (buffer_prio(bh))
3034                rw |= REQ_PRIO;
3035
3036        submit_bio(rw, bio);
3037        return 0;
3038}
3039
3040int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3041{
3042        return submit_bh_wbc(rw, bh, bio_flags, NULL);
3043}
3044EXPORT_SYMBOL_GPL(_submit_bh);
3045
3046int submit_bh(int rw, struct buffer_head *bh)
3047{
3048        return submit_bh_wbc(rw, bh, 0, NULL);
3049}
3050EXPORT_SYMBOL(submit_bh);
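
/*
 * The BUG_ON()s in submit_bh_wbc() spell out the calling convention: the
 * buffer must be locked and mapped, b_end_io must be set, and the caller
 * should hold a reference across the I/O.  A minimal synchronous-write
 * sketch, essentially what __sync_dirty_buffer() further down does; the
 * reference taken with get_bh() is dropped by end_buffer_write_sync():
 *
 *      lock_buffer(bh);
 *      if (test_clear_buffer_dirty(bh)) {
 *              get_bh(bh);
 *              bh->b_end_io = end_buffer_write_sync;
 *              submit_bh(WRITE, bh);
 *              wait_on_buffer(bh);
 *      } else {
 *              unlock_buffer(bh);
 *      }
 */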
3051
3052/**
3053 * ll_rw_block: low-level access to block devices (DEPRECATED)
3054 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3055 * @nr: number of &struct buffer_heads in the array
3056 * @bhs: array of pointers to &struct buffer_head
3057 *
3058 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3059 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3060 * %READA option is described in the documentation for generic_make_request()
3061 * which ll_rw_block() calls.
3062 *
3063 * This function drops any buffer that it cannot get a lock on (with the
3064 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3065 * request, and any buffer that appears to be up-to-date when doing a read
3066 * request.  Further, it marks as clean any buffers that are processed for
3067 * writing (the buffer cache won't assume that they are actually clean
3068 * until the buffer gets unlocked).
3069 *
3070 * ll_rw_block sets b_end_io to a simple completion handler that marks
3071 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3072 * any waiters.
3073 *
3074 * All of the buffers must be for the same device, and their sizes must be
3075 * a multiple of the currently approved block size for the device.
3076 */
3077void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3078{
3079        int i;
3080
3081        for (i = 0; i < nr; i++) {
3082                struct buffer_head *bh = bhs[i];
3083
3084                if (!trylock_buffer(bh))
3085                        continue;
3086                if (rw == WRITE) {
3087                        if (test_clear_buffer_dirty(bh)) {
3088                                bh->b_end_io = end_buffer_write_sync;
3089                                get_bh(bh);
3090                                submit_bh(WRITE, bh);
3091                                continue;
3092                        }
3093                } else {
3094                        if (!buffer_uptodate(bh)) {
3095                                bh->b_end_io = end_buffer_read_sync;
3096                                get_bh(bh);
3097                                submit_bh(rw, bh);
3098                                continue;
3099                        }
3100                }
3101                unlock_buffer(bh);
3102        }
3103}
3104EXPORT_SYMBOL(ll_rw_block);
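
/*
 * Example use of ll_rw_block(): start reads on a couple of buffers and wait
 * for them.  Because ll_rw_block() silently skips buffers it cannot lock,
 * callers that need the data must re-check the uptodate state afterwards.
 * A sketch:
 *
 *      struct buffer_head *bhs[2] = { bh1, bh2 };
 *
 *      ll_rw_block(READ, 2, bhs);
 *      wait_on_buffer(bhs[0]);
 *      wait_on_buffer(bhs[1]);
 *      if (!buffer_uptodate(bhs[0]) || !buffer_uptodate(bhs[1]))
 *              return -EIO;
 */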
3105
3106void write_dirty_buffer(struct buffer_head *bh, int rw)
3107{
3108        lock_buffer(bh);
3109        if (!test_clear_buffer_dirty(bh)) {
3110                unlock_buffer(bh);
3111                return;
3112        }
3113        bh->b_end_io = end_buffer_write_sync;
3114        get_bh(bh);
3115        submit_bh(rw, bh);
3116}
3117EXPORT_SYMBOL(write_dirty_buffer);
3118
3119/*
3120 * For a data-integrity writeout, we need to wait for any in-progress I/O
3121 * to complete, start new I/O, and then wait for that I/O to complete as
3122 * well.  The caller must hold a reference on the buffer_head.
3123 */
3124int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3125{
3126        int ret = 0;
3127
3128        WARN_ON(atomic_read(&bh->b_count) < 1);
3129        lock_buffer(bh);
3130        if (test_clear_buffer_dirty(bh)) {
3131                get_bh(bh);
3132                bh->b_end_io = end_buffer_write_sync;
3133                ret = submit_bh(rw, bh);
3134                wait_on_buffer(bh);
3135                if (!ret && !buffer_uptodate(bh))
3136                        ret = -EIO;
3137        } else {
3138                unlock_buffer(bh);
3139        }
3140        return ret;
3141}
3142EXPORT_SYMBOL(__sync_dirty_buffer);
3143
3144int sync_dirty_buffer(struct buffer_head *bh)
3145{
3146        return __sync_dirty_buffer(bh, WRITE_SYNC);
3147}
3148EXPORT_SYMBOL(sync_dirty_buffer);
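
/*
 * Example: flushing an in-core superblock buffer to disk, the common pattern
 * for sync_dirty_buffer().  Here "sbh" stands for a previously read
 * superblock buffer_head on which the caller holds a reference:
 *
 *      mark_buffer_dirty(sbh);
 *      err = sync_dirty_buffer(sbh);
 *      if (err)
 *              printk(KERN_ERR "myfs: superblock write failed\n");
 */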
3149
3150/*
3151 * try_to_free_buffers() checks if all the buffers on this particular page
3152 * are unused, and releases them if so.
3153 *
3154 * Exclusion against try_to_free_buffers may be obtained by either
3155 * locking the page or by holding its mapping's private_lock.
3156 *
3157 * If the page is dirty but all the buffers are clean then we need to
3158 * be sure to mark the page clean as well.  This is because the page
3159 * may be against a block device, and a later reattachment of buffers
3160 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3161 * filesystem data on the same device.
3162 *
3163 * The same applies to regular filesystem pages: if all the buffers are
3164 * clean then we set the page clean and proceed.  To do that, we require
3165 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3166 * private_lock.
3167 *
3168 * try_to_free_buffers() is non-blocking.
3169 */
3170static inline int buffer_busy(struct buffer_head *bh)
3171{
3172        return atomic_read(&bh->b_count) |
3173                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3174}
3175
3176static int
3177drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3178{
3179        struct buffer_head *head = page_buffers(page);
3180        struct buffer_head *bh;
3181
3182        bh = head;
3183        do {
3184                if (buffer_write_io_error(bh) && page->mapping)
3185                        set_bit(AS_EIO, &page->mapping->flags);
3186                if (buffer_busy(bh))
3187                        goto failed;
3188                bh = bh->b_this_page;
3189        } while (bh != head);
3190
3191        do {
3192                struct buffer_head *next = bh->b_this_page;
3193
3194                if (bh->b_assoc_map)
3195                        __remove_assoc_queue(bh);
3196                bh = next;
3197        } while (bh != head);
3198        *buffers_to_free = head;
3199        __clear_page_buffers(page);
3200        return 1;
3201failed:
3202        return 0;
3203}
3204
3205int try_to_free_buffers(struct page *page)
3206{
3207        struct address_space * const mapping = page->mapping;
3208        struct buffer_head *buffers_to_free = NULL;
3209        int ret = 0;
3210
3211        BUG_ON(!PageLocked(page));
3212        if (PageWriteback(page))
3213                return 0;
3214
3215        if (mapping == NULL) {          /* can this still happen? */
3216                ret = drop_buffers(page, &buffers_to_free);
3217                goto out;
3218        }
3219
3220        spin_lock(&mapping->private_lock);
3221        ret = drop_buffers(page, &buffers_to_free);
3222
3223        /*
3224         * If the filesystem writes its buffers by hand (eg ext3)
3225         * then we can have clean buffers against a dirty page.  We
3226         * clean the page here; otherwise the VM will never notice
3227         * that the filesystem did any IO at all.
3228         *
3229         * Also, during truncate, discard_buffer will have marked all
3230         * the page's buffers clean.  We discover that here and clean
3231         * the page also.
3232         *
3233         * private_lock must be held over this entire operation in order
3234         * to synchronise against __set_page_dirty_buffers and prevent the
3235         * dirty bit from being lost.
3236         */
3237        if (ret)
3238                cancel_dirty_page(page);
3239        spin_unlock(&mapping->private_lock);
3240out:
3241        if (buffers_to_free) {
3242                struct buffer_head *bh = buffers_to_free;
3243
3244                do {
3245                        struct buffer_head *next = bh->b_this_page;
3246                        free_buffer_head(bh);
3247                        bh = next;
3248                } while (bh != buffers_to_free);
3249        }
3250        return ret;
3251}
3252EXPORT_SYMBOL(try_to_free_buffers);
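
/*
 * A filesystem with no private page state of its own can typically implement
 * the address_space ->releasepage method as a thin wrapper around
 * try_to_free_buffers().  A hypothetical sketch:
 *
 *      static int myfs_releasepage(struct page *page, gfp_t gfp)
 *      {
 *              return try_to_free_buffers(page);
 *      }
 */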
3253
3254/*
3255 * There are no bdflush tunables left.  But distributions are
3256 * still running obsolete flush daemons, so we terminate them here.
3257 *
3258 * Use of bdflush() is deprecated and will be removed in a future kernel.
3259 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3260 */
3261SYSCALL_DEFINE2(bdflush, int, func, long, data)
3262{
3263        static int msg_count;
3264
3265        if (!capable(CAP_SYS_ADMIN))
3266                return -EPERM;
3267
3268        if (msg_count < 5) {
3269                msg_count++;
3270                printk(KERN_INFO
3271                        "warning: process `%s' used the obsolete bdflush"
3272                        " system call\n", current->comm);
3273                printk(KERN_INFO "Fix your initscripts?\n");
3274        }
3275
3276        if (func == 1)
3277                do_exit(0);
3278        return 0;
3279}
3280
3281/*
3282 * Buffer-head allocation
3283 */
3284static struct kmem_cache *bh_cachep __read_mostly;
3285
3286/*
3287 * Once the number of bh's in the machine exceeds this level, we start
3288 * stripping them in writeback.
3289 */
3290static unsigned long max_buffer_heads;
3291
3292int buffer_heads_over_limit;
3293
3294struct bh_accounting {
3295        int nr;                 /* Number of live bh's */
3296        int ratelimit;          /* Limit cacheline bouncing */
3297};
3298
3299static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3300
3301static void recalc_bh_state(void)
3302{
3303        int i;
3304        int tot = 0;
3305
3306        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3307                return;
3308        __this_cpu_write(bh_accounting.ratelimit, 0);
3309        for_each_online_cpu(i)
3310                tot += per_cpu(bh_accounting, i).nr;
3311        buffer_heads_over_limit = (tot > max_buffer_heads);
3312}
3313
3314struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3315{
3316        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3317        if (ret) {
3318                INIT_LIST_HEAD(&ret->b_assoc_buffers);
3319                preempt_disable();
3320                __this_cpu_inc(bh_accounting.nr);
3321                recalc_bh_state();
3322                preempt_enable();
3323        }
3324        return ret;
3325}
3326EXPORT_SYMBOL(alloc_buffer_head);
3327
3328void free_buffer_head(struct buffer_head *bh)
3329{
3330        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3331        kmem_cache_free(bh_cachep, bh);
3332        preempt_disable();
3333        __this_cpu_dec(bh_accounting.nr);
3334        recalc_bh_state();
3335        preempt_enable();
3336}
3337EXPORT_SYMBOL(free_buffer_head);
3338
3339static void buffer_exit_cpu(int cpu)
3340{
3341        int i;
3342        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3343
3344        for (i = 0; i < BH_LRU_SIZE; i++) {
3345                brelse(b->bhs[i]);
3346                b->bhs[i] = NULL;
3347        }
3348        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3349        per_cpu(bh_accounting, cpu).nr = 0;
3350}
3351
3352static int buffer_cpu_notify(struct notifier_block *self,
3353                              unsigned long action, void *hcpu)
3354{
3355        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3356                buffer_exit_cpu((unsigned long)hcpu);
3357        return NOTIFY_OK;
3358}
3359
3360/**
3361 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3362 * @bh: struct buffer_head
3363 *
3364 * Returns 1 if the buffer is up-to-date.  Returns 0, with the buffer
3365 * locked, if it is not.
3366 */
3367int bh_uptodate_or_lock(struct buffer_head *bh)
3368{
3369        if (!buffer_uptodate(bh)) {
3370                lock_buffer(bh);
3371                if (!buffer_uptodate(bh))
3372                        return 0;
3373                unlock_buffer(bh);
3374        }
3375        return 1;
3376}
3377EXPORT_SYMBOL(bh_uptodate_or_lock);
3378
3379/**
3380 * bh_submit_read - Submit a locked buffer for reading
3381 * @bh: struct buffer_head
3382 *
3383 * Returns zero on success and -EIO on error.
3384 */
3385int bh_submit_read(struct buffer_head *bh)
3386{
3387        BUG_ON(!buffer_locked(bh));
3388
3389        if (buffer_uptodate(bh)) {
3390                unlock_buffer(bh);
3391                return 0;
3392        }
3393
3394        get_bh(bh);
3395        bh->b_end_io = end_buffer_read_sync;
3396        submit_bh(READ, bh);
3397        wait_on_buffer(bh);
3398        if (buffer_uptodate(bh))
3399                return 0;
3400        return -EIO;
3401}
3402EXPORT_SYMBOL(bh_submit_read);
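
/*
 * bh_uptodate_or_lock() and bh_submit_read() are meant to be used together:
 * on the "not uptodate" path the former returns with the buffer locked,
 * which is exactly what the latter expects.  A sketch of the usual
 * "make sure this buffer contains data" sequence:
 *
 *      if (!bh_uptodate_or_lock(bh)) {
 *              err = bh_submit_read(bh);
 *              if (err)
 *                      return err;
 *      }
 */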
3403
3404void __init buffer_init(void)
3405{
3406        unsigned long nrpages;
3407
3408        bh_cachep = kmem_cache_create("buffer_head",
3409                        sizeof(struct buffer_head), 0,
3410                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3411                                SLAB_MEM_SPREAD),
3412                                NULL);
3413
3414        /*
3415         * Limit the bh occupancy to 10% of ZONE_NORMAL
3416         */
3417        nrpages = (nr_free_buffer_pages() * 10) / 100;
3418        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3419        hotcpu_notifier(buffer_cpu_notify, 0);
3420}
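
/*
 * Rough numbers for the limit above: with 4 KiB pages and a buffer_head of
 * roughly 100 bytes on a 64-bit build, each page holds about 40 bh's.  On a
 * machine with 4 GiB of ZONE_NORMAL (about one million buffer pages),
 * nrpages is about 100,000 and max_buffer_heads therefore comes out around
 * four million, i.e. the buffer_head structures themselves are capped at
 * roughly 10% of memory.
 */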
3421