linux/fs/buffer.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/fs/buffer.c
   4 *
   5 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   6 */
   7
   8/*
   9 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
  10 *
  11 * Removed a lot of unnecessary code and simplified things now that
  12 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  13 *
  14 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  15 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  16 *
   17 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
  18 *
  19 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  20 */
  21
  22#include <linux/kernel.h>
  23#include <linux/sched/signal.h>
  24#include <linux/syscalls.h>
  25#include <linux/fs.h>
  26#include <linux/iomap.h>
  27#include <linux/mm.h>
  28#include <linux/percpu.h>
  29#include <linux/slab.h>
  30#include <linux/capability.h>
  31#include <linux/blkdev.h>
  32#include <linux/file.h>
  33#include <linux/quotaops.h>
  34#include <linux/highmem.h>
  35#include <linux/export.h>
  36#include <linux/backing-dev.h>
  37#include <linux/writeback.h>
  38#include <linux/hash.h>
  39#include <linux/suspend.h>
  40#include <linux/buffer_head.h>
  41#include <linux/task_io_accounting_ops.h>
  42#include <linux/bio.h>
  43#include <linux/cpu.h>
  44#include <linux/bitops.h>
  45#include <linux/mpage.h>
  46#include <linux/bit_spinlock.h>
  47#include <linux/pagevec.h>
  48#include <linux/sched/mm.h>
  49#include <trace/events/block.h>
  50#include <linux/fscrypt.h>
  51
  52#include "internal.h"
  53
  54static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  55static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
  56                         enum rw_hint hint, struct writeback_control *wbc);
  57
  58#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  59
  60inline void touch_buffer(struct buffer_head *bh)
  61{
  62        trace_block_touch_buffer(bh);
  63        mark_page_accessed(bh->b_page);
  64}
  65EXPORT_SYMBOL(touch_buffer);
  66
  67void __lock_buffer(struct buffer_head *bh)
  68{
  69        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  70}
  71EXPORT_SYMBOL(__lock_buffer);
  72
  73void unlock_buffer(struct buffer_head *bh)
  74{
  75        clear_bit_unlock(BH_Lock, &bh->b_state);
  76        smp_mb__after_atomic();
  77        wake_up_bit(&bh->b_state, BH_Lock);
  78}
  79EXPORT_SYMBOL(unlock_buffer);
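/*
 * Sketch of the usual locking pattern built on these helpers (a minimal
 * example; the zero-fill is just an illustrative payload): callers take
 * the buffer lock around any update that must be serialized against I/O.
 * lock_buffer() in buffer_head.h is the trylock fast path that falls back
 * to __lock_buffer() above when BH_Lock is already set.
 *
 *      lock_buffer(bh);
 *      if (!buffer_uptodate(bh)) {
 *              memset(bh->b_data, 0, bh->b_size);
 *              set_buffer_uptodate(bh);
 *      }
 *      unlock_buffer(bh);
 */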
  80
  81/*
   82 * Returns whether the page has dirty or writeback buffers. If all the buffers
   83 * are unlocked and clean then the PageDirty information is stale. If
   84 * any of the buffers are locked, it is assumed they are locked for IO.
  85 */
  86void buffer_check_dirty_writeback(struct page *page,
  87                                     bool *dirty, bool *writeback)
  88{
  89        struct buffer_head *head, *bh;
  90        *dirty = false;
  91        *writeback = false;
  92
  93        BUG_ON(!PageLocked(page));
  94
  95        if (!page_has_buffers(page))
  96                return;
  97
  98        if (PageWriteback(page))
  99                *writeback = true;
 100
 101        head = page_buffers(page);
 102        bh = head;
 103        do {
 104                if (buffer_locked(bh))
 105                        *writeback = true;
 106
 107                if (buffer_dirty(bh))
 108                        *dirty = true;
 109
 110                bh = bh->b_this_page;
 111        } while (bh != head);
 112}
 113EXPORT_SYMBOL(buffer_check_dirty_writeback);
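/*
 * Sketch of how this helper is meant to be consumed (illustrative aops;
 * "example_aops" is not a real structure in the tree): its signature
 * matches the ->is_dirty_writeback() address_space operation that vmscan
 * consults, so a buffer_head based filesystem can point that hook here:
 *
 *      static const struct address_space_operations example_aops = {
 *              ...
 *              .is_dirty_writeback = buffer_check_dirty_writeback,
 *      };
 */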
 114
 115/*
 116 * Block until a buffer comes unlocked.  This doesn't stop it
 117 * from becoming locked again - you have to lock it yourself
 118 * if you want to preserve its state.
 119 */
 120void __wait_on_buffer(struct buffer_head * bh)
 121{
 122        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 123}
 124EXPORT_SYMBOL(__wait_on_buffer);
 125
 126static void buffer_io_error(struct buffer_head *bh, char *msg)
 127{
 128        if (!test_bit(BH_Quiet, &bh->b_state))
 129                printk_ratelimited(KERN_ERR
 130                        "Buffer I/O error on dev %pg, logical block %llu%s\n",
 131                        bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 132}
 133
 134/*
 135 * End-of-IO handler helper function which does not touch the bh after
 136 * unlocking it.
 137 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
  138 * a race there is benign: unlock_buffer() only uses the bh's address for
 139 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 140 * itself.
 141 */
 142static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 143{
 144        if (uptodate) {
 145                set_buffer_uptodate(bh);
 146        } else {
 147                /* This happens, due to failed read-ahead attempts. */
 148                clear_buffer_uptodate(bh);
 149        }
 150        unlock_buffer(bh);
 151}
 152
 153/*
 154 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 155 * unlock the buffer. This is what ll_rw_block uses too.
 156 */
 157void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 158{
 159        __end_buffer_read_notouch(bh, uptodate);
 160        put_bh(bh);
 161}
 162EXPORT_SYMBOL(end_buffer_read_sync);
 163
 164void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 165{
 166        if (uptodate) {
 167                set_buffer_uptodate(bh);
 168        } else {
 169                buffer_io_error(bh, ", lost sync page write");
 170                mark_buffer_write_io_error(bh);
 171                clear_buffer_uptodate(bh);
 172        }
 173        unlock_buffer(bh);
 174        put_bh(bh);
 175}
 176EXPORT_SYMBOL(end_buffer_write_sync);
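/*
 * Sketch of the synchronous single-buffer write that this completion
 * supports (essentially what sync_dirty_buffer() does elsewhere in this
 * file, condensed; "err" is a caller-side variable):
 *
 *      lock_buffer(bh);
 *      if (test_clear_buffer_dirty(bh)) {
 *              get_bh(bh);
 *              bh->b_end_io = end_buffer_write_sync;
 *              submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 *              wait_on_buffer(bh);
 *              if (!buffer_uptodate(bh))
 *                      err = -EIO;
 *      } else {
 *              unlock_buffer(bh);
 *      }
 */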
 177
 178/*
 179 * Various filesystems appear to want __find_get_block to be non-blocking.
 180 * But it's the page lock which protects the buffers.  To get around this,
 181 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 182 * private_lock.
 183 *
 184 * Hack idea: for the blockdev mapping, private_lock contention
 185 * may be quite high.  This code could TryLock the page, and if that
 186 * succeeds, there is no need to take private_lock.
 187 */
 188static struct buffer_head *
 189__find_get_block_slow(struct block_device *bdev, sector_t block)
 190{
 191        struct inode *bd_inode = bdev->bd_inode;
 192        struct address_space *bd_mapping = bd_inode->i_mapping;
 193        struct buffer_head *ret = NULL;
 194        pgoff_t index;
 195        struct buffer_head *bh;
 196        struct buffer_head *head;
 197        struct page *page;
 198        int all_mapped = 1;
 199        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 200
 201        index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 202        page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 203        if (!page)
 204                goto out;
 205
 206        spin_lock(&bd_mapping->private_lock);
 207        if (!page_has_buffers(page))
 208                goto out_unlock;
 209        head = page_buffers(page);
 210        bh = head;
 211        do {
 212                if (!buffer_mapped(bh))
 213                        all_mapped = 0;
 214                else if (bh->b_blocknr == block) {
 215                        ret = bh;
 216                        get_bh(bh);
 217                        goto out_unlock;
 218                }
 219                bh = bh->b_this_page;
 220        } while (bh != head);
 221
 222        /* we might be here because some of the buffers on this page are
 223         * not mapped.  This is due to various races between
 224         * file io on the block device and getblk.  It gets dealt with
 225         * elsewhere, don't buffer_error if we had some unmapped buffers
 226         */
 227        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
 228        if (all_mapped && __ratelimit(&last_warned)) {
 229                printk("__find_get_block_slow() failed. block=%llu, "
 230                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
 231                       "device %pg blocksize: %d\n",
 232                       (unsigned long long)block,
 233                       (unsigned long long)bh->b_blocknr,
 234                       bh->b_state, bh->b_size, bdev,
 235                       1 << bd_inode->i_blkbits);
 236        }
 237out_unlock:
 238        spin_unlock(&bd_mapping->private_lock);
 239        put_page(page);
 240out:
 241        return ret;
 242}
 243
 244static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 245{
 246        unsigned long flags;
 247        struct buffer_head *first;
 248        struct buffer_head *tmp;
 249        struct page *page;
 250        int page_uptodate = 1;
 251
 252        BUG_ON(!buffer_async_read(bh));
 253
 254        page = bh->b_page;
 255        if (uptodate) {
 256                set_buffer_uptodate(bh);
 257        } else {
 258                clear_buffer_uptodate(bh);
 259                buffer_io_error(bh, ", async page read");
 260                SetPageError(page);
 261        }
 262
 263        /*
 264         * Be _very_ careful from here on. Bad things can happen if
 265         * two buffer heads end IO at almost the same time and both
 266         * decide that the page is now completely done.
 267         */
 268        first = page_buffers(page);
 269        spin_lock_irqsave(&first->b_uptodate_lock, flags);
 270        clear_buffer_async_read(bh);
 271        unlock_buffer(bh);
 272        tmp = bh;
 273        do {
 274                if (!buffer_uptodate(tmp))
 275                        page_uptodate = 0;
 276                if (buffer_async_read(tmp)) {
 277                        BUG_ON(!buffer_locked(tmp));
 278                        goto still_busy;
 279                }
 280                tmp = tmp->b_this_page;
 281        } while (tmp != bh);
 282        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 283
 284        /*
 285         * If none of the buffers had errors and they are all
 286         * uptodate then we can set the page uptodate.
 287         */
 288        if (page_uptodate && !PageError(page))
 289                SetPageUptodate(page);
 290        unlock_page(page);
 291        return;
 292
 293still_busy:
 294        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 295        return;
 296}
 297
 298struct decrypt_bh_ctx {
 299        struct work_struct work;
 300        struct buffer_head *bh;
 301};
 302
 303static void decrypt_bh(struct work_struct *work)
 304{
 305        struct decrypt_bh_ctx *ctx =
 306                container_of(work, struct decrypt_bh_ctx, work);
 307        struct buffer_head *bh = ctx->bh;
 308        int err;
 309
 310        err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
 311                                               bh_offset(bh));
 312        end_buffer_async_read(bh, err == 0);
 313        kfree(ctx);
 314}
 315
 316/*
 317 * I/O completion handler for block_read_full_page() - pages
 318 * which come unlocked at the end of I/O.
 319 */
 320static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 321{
 322        /* Decrypt if needed */
 323        if (uptodate &&
 324            fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
 325                struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
 326
 327                if (ctx) {
 328                        INIT_WORK(&ctx->work, decrypt_bh);
 329                        ctx->bh = bh;
 330                        fscrypt_enqueue_decrypt_work(&ctx->work);
 331                        return;
 332                }
 333                uptodate = 0;
 334        }
 335        end_buffer_async_read(bh, uptodate);
 336}
 337
 338/*
 339 * Completion handler for block_write_full_page() - pages which are unlocked
 340 * during I/O, and which have PageWriteback cleared upon I/O completion.
 341 */
 342void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 343{
 344        unsigned long flags;
 345        struct buffer_head *first;
 346        struct buffer_head *tmp;
 347        struct page *page;
 348
 349        BUG_ON(!buffer_async_write(bh));
 350
 351        page = bh->b_page;
 352        if (uptodate) {
 353                set_buffer_uptodate(bh);
 354        } else {
 355                buffer_io_error(bh, ", lost async page write");
 356                mark_buffer_write_io_error(bh);
 357                clear_buffer_uptodate(bh);
 358                SetPageError(page);
 359        }
 360
 361        first = page_buffers(page);
 362        spin_lock_irqsave(&first->b_uptodate_lock, flags);
 363
 364        clear_buffer_async_write(bh);
 365        unlock_buffer(bh);
 366        tmp = bh->b_this_page;
 367        while (tmp != bh) {
 368                if (buffer_async_write(tmp)) {
 369                        BUG_ON(!buffer_locked(tmp));
 370                        goto still_busy;
 371                }
 372                tmp = tmp->b_this_page;
 373        }
 374        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 375        end_page_writeback(page);
 376        return;
 377
 378still_busy:
 379        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 380        return;
 381}
 382EXPORT_SYMBOL(end_buffer_async_write);
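/*
 * Condensed sketch of the writepage side that pairs with this completion
 * (the pattern used by __block_write_full_page() later in this file, with
 * the per-buffer loop and error handling omitted):
 *
 *      lock_buffer(bh);
 *      if (test_clear_buffer_dirty(bh))
 *              mark_buffer_async_write(bh);
 *      else
 *              unlock_buffer(bh);
 *      ...
 *      set_page_writeback(page);
 *      unlock_page(page);
 *      submit_bh(REQ_OP_WRITE, 0, bh);
 *
 * The last submit_bh() is issued for each buffer left marked async_write.
 */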
 383
 384/*
 385 * If a page's buffers are under async readin (end_buffer_async_read
 386 * completion) then there is a possibility that another thread of
 387 * control could lock one of the buffers after it has completed
 388 * but while some of the other buffers have not completed.  This
 389 * locked buffer would confuse end_buffer_async_read() into not unlocking
 390 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 391 * that this buffer is not under async I/O.
 392 *
 393 * The page comes unlocked when it has no locked buffer_async buffers
 394 * left.
 395 *
 396 * PageLocked prevents anyone starting new async I/O reads any of
 397 * the buffers.
 398 *
 399 * PageWriteback is used to prevent simultaneous writeout of the same
 400 * page.
 401 *
 402 * PageLocked prevents anyone from starting writeback of a page which is
 403 * under read I/O (PageWriteback is only ever set against a locked page).
 404 */
 405static void mark_buffer_async_read(struct buffer_head *bh)
 406{
 407        bh->b_end_io = end_buffer_async_read_io;
 408        set_buffer_async_read(bh);
 409}
 410
 411static void mark_buffer_async_write_endio(struct buffer_head *bh,
 412                                          bh_end_io_t *handler)
 413{
 414        bh->b_end_io = handler;
 415        set_buffer_async_write(bh);
 416}
 417
 418void mark_buffer_async_write(struct buffer_head *bh)
 419{
 420        mark_buffer_async_write_endio(bh, end_buffer_async_write);
 421}
 422EXPORT_SYMBOL(mark_buffer_async_write);
 423
 424
 425/*
 426 * fs/buffer.c contains helper functions for buffer-backed address space's
 427 * fsync functions.  A common requirement for buffer-based filesystems is
 428 * that certain data from the backing blockdev needs to be written out for
 429 * a successful fsync().  For example, ext2 indirect blocks need to be
 430 * written back and waited upon before fsync() returns.
 431 *
 432 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 433 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 434 * management of a list of dependent buffers at ->i_mapping->private_list.
 435 *
 436 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 437 * from their controlling inode's queue when they are being freed.  But
 438 * try_to_free_buffers() will be operating against the *blockdev* mapping
 439 * at the time, not against the S_ISREG file which depends on those buffers.
 440 * So the locking for private_list is via the private_lock in the address_space
 441 * which backs the buffers.  Which is different from the address_space 
 442 * against which the buffers are listed.  So for a particular address_space,
 443 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 444 * mapping->private_list will always be protected by the backing blockdev's
 445 * ->private_lock.
 446 *
 447 * Which introduces a requirement: all buffers on an address_space's
 448 * ->private_list must be from the same address_space: the blockdev's.
 449 *
 450 * address_spaces which do not place buffers at ->private_list via these
 451 * utility functions are free to use private_lock and private_list for
 452 * whatever they want.  The only requirement is that list_empty(private_list)
 453 * be true at clear_inode() time.
 454 *
 455 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 456 * filesystems should do that.  invalidate_inode_buffers() should just go
 457 * BUG_ON(!list_empty).
 458 *
 459 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 460 * take an address_space, not an inode.  And it should be called
 461 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 462 * queued up.
 463 *
 464 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 465 * list if it is already on a list.  Because if the buffer is on a list,
 466 * it *must* already be on the right one.  If not, the filesystem is being
 467 * silly.  This will save a ton of locking.  But first we have to ensure
 468 * that buffers are taken *off* the old inode's list when they are freed
 469 * (presumably in truncate).  That requires careful auditing of all
 470 * filesystems (do it inside bforget()).  It could also be done by bringing
 471 * b_inode back.
 472 */
 473
 474/*
 475 * The buffer's backing address_space's private_lock must be held
 476 */
 477static void __remove_assoc_queue(struct buffer_head *bh)
 478{
 479        list_del_init(&bh->b_assoc_buffers);
 480        WARN_ON(!bh->b_assoc_map);
 481        bh->b_assoc_map = NULL;
 482}
 483
 484int inode_has_buffers(struct inode *inode)
 485{
 486        return !list_empty(&inode->i_data.private_list);
 487}
 488
 489/*
 490 * osync is designed to support O_SYNC io.  It waits synchronously for
 491 * all already-submitted IO to complete, but does not queue any new
 492 * writes to the disk.
 493 *
 494 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 495 * you dirty the buffers, and then use osync_inode_buffers to wait for
 496 * completion.  Any other dirty buffers which are not yet queued for
 497 * write will not be flushed to disk by the osync.
 498 */
 499static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 500{
 501        struct buffer_head *bh;
 502        struct list_head *p;
 503        int err = 0;
 504
 505        spin_lock(lock);
 506repeat:
 507        list_for_each_prev(p, list) {
 508                bh = BH_ENTRY(p);
 509                if (buffer_locked(bh)) {
 510                        get_bh(bh);
 511                        spin_unlock(lock);
 512                        wait_on_buffer(bh);
 513                        if (!buffer_uptodate(bh))
 514                                err = -EIO;
 515                        brelse(bh);
 516                        spin_lock(lock);
 517                        goto repeat;
 518                }
 519        }
 520        spin_unlock(lock);
 521        return err;
 522}
 523
 524void emergency_thaw_bdev(struct super_block *sb)
 525{
 526        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 527                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
 528}
 529
 530/**
 531 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 532 * @mapping: the mapping which wants those buffers written
 533 *
 534 * Starts I/O against the buffers at mapping->private_list, and waits upon
 535 * that I/O.
 536 *
 537 * Basically, this is a convenience function for fsync().
 538 * @mapping is a file or directory which needs those buffers to be written for
 539 * a successful fsync().
 540 */
 541int sync_mapping_buffers(struct address_space *mapping)
 542{
 543        struct address_space *buffer_mapping = mapping->private_data;
 544
 545        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 546                return 0;
 547
 548        return fsync_buffers_list(&buffer_mapping->private_lock,
 549                                        &mapping->private_list);
 550}
 551EXPORT_SYMBOL(sync_mapping_buffers);
 552
 553/*
 554 * Called when we've recently written block `bblock', and it is known that
 555 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 556 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 557 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 558 */
 559void write_boundary_block(struct block_device *bdev,
 560                        sector_t bblock, unsigned blocksize)
 561{
 562        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 563        if (bh) {
 564                if (buffer_dirty(bh))
 565                        ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
 566                put_bh(bh);
 567        }
 568}
 569
 570void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 571{
 572        struct address_space *mapping = inode->i_mapping;
 573        struct address_space *buffer_mapping = bh->b_page->mapping;
 574
 575        mark_buffer_dirty(bh);
 576        if (!mapping->private_data) {
 577                mapping->private_data = buffer_mapping;
 578        } else {
 579                BUG_ON(mapping->private_data != buffer_mapping);
 580        }
 581        if (!bh->b_assoc_map) {
 582                spin_lock(&buffer_mapping->private_lock);
 583                list_move_tail(&bh->b_assoc_buffers,
 584                                &mapping->private_list);
 585                bh->b_assoc_map = mapping;
 586                spin_unlock(&buffer_mapping->private_lock);
 587        }
 588}
 589EXPORT_SYMBOL(mark_buffer_dirty_inode);
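/*
 * Sketch of how the association helpers are used by an ext2-like
 * filesystem (illustrative, error handling omitted): metadata buffers are
 * tied to the inode that depends on them, and the inode's ->fsync() later
 * writes out and waits on the whole list via sync_mapping_buffers():
 *
 *      bh = sb_bread(inode->i_sb, block);
 *      if (bh) {
 *              ... update the indirect block in bh->b_data ...
 *              mark_buffer_dirty_inode(bh, inode);
 *              brelse(bh);
 *      }
 *
 *      ->fsync():
 *              err = sync_mapping_buffers(inode->i_mapping);
 */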
 590
 591/*
 592 * Mark the page dirty, and set it dirty in the page cache, and mark the inode
 593 * dirty.
 594 *
 595 * If warn is true, then emit a warning if the page is not uptodate and has
 596 * not been truncated.
 597 *
 598 * The caller must hold lock_page_memcg().
 599 */
 600void __set_page_dirty(struct page *page, struct address_space *mapping,
 601                             int warn)
 602{
 603        unsigned long flags;
 604
 605        xa_lock_irqsave(&mapping->i_pages, flags);
 606        if (page->mapping) {    /* Race with truncate? */
 607                WARN_ON_ONCE(warn && !PageUptodate(page));
 608                account_page_dirtied(page, mapping);
 609                __xa_set_mark(&mapping->i_pages, page_index(page),
 610                                PAGECACHE_TAG_DIRTY);
 611        }
 612        xa_unlock_irqrestore(&mapping->i_pages, flags);
 613}
 614EXPORT_SYMBOL_GPL(__set_page_dirty);
 615
 616/*
 617 * Add a page to the dirty page list.
 618 *
 619 * It is a sad fact of life that this function is called from several places
 620 * deeply under spinlocking.  It may not sleep.
 621 *
 622 * If the page has buffers, the uptodate buffers are set dirty, to preserve
  623 * dirty-state coherency between the page and the buffers.  If the page does
 624 * not have buffers then when they are later attached they will all be set
 625 * dirty.
 626 *
 627 * The buffers are dirtied before the page is dirtied.  There's a small race
 628 * window in which a writepage caller may see the page cleanness but not the
 629 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 630 * before the buffers, a concurrent writepage caller could clear the page dirty
 631 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 632 * page on the dirty page list.
 633 *
 634 * We use private_lock to lock against try_to_free_buffers while using the
 635 * page's buffer list.  Also use this to protect against clean buffers being
 636 * added to the page after it was set dirty.
 637 *
 638 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 639 * address_space though.
 640 */
 641int __set_page_dirty_buffers(struct page *page)
 642{
 643        int newly_dirty;
 644        struct address_space *mapping = page_mapping(page);
 645
 646        if (unlikely(!mapping))
 647                return !TestSetPageDirty(page);
 648
 649        spin_lock(&mapping->private_lock);
 650        if (page_has_buffers(page)) {
 651                struct buffer_head *head = page_buffers(page);
 652                struct buffer_head *bh = head;
 653
 654                do {
 655                        set_buffer_dirty(bh);
 656                        bh = bh->b_this_page;
 657                } while (bh != head);
 658        }
 659        /*
 660         * Lock out page->mem_cgroup migration to keep PageDirty
 661         * synchronized with per-memcg dirty page counters.
 662         */
 663        lock_page_memcg(page);
 664        newly_dirty = !TestSetPageDirty(page);
 665        spin_unlock(&mapping->private_lock);
 666
 667        if (newly_dirty)
 668                __set_page_dirty(page, mapping, 1);
 669
 670        unlock_page_memcg(page);
 671
 672        if (newly_dirty)
 673                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 674
 675        return newly_dirty;
 676}
 677EXPORT_SYMBOL(__set_page_dirty_buffers);
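/*
 * Sketch of where this fits: set_page_dirty() in mm/page-writeback.c falls
 * back to __set_page_dirty_buffers() when a mapping supplies no
 * ->set_page_dirty() method, and a buffer_head based filesystem may also
 * wire it up explicitly in its aops:
 *
 *      .set_page_dirty = __set_page_dirty_buffers,
 */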
 678
 679/*
 680 * Write out and wait upon a list of buffers.
 681 *
 682 * We have conflicting pressures: we want to make sure that all
 683 * initially dirty buffers get waited on, but that any subsequently
 684 * dirtied buffers don't.  After all, we don't want fsync to last
 685 * forever if somebody is actively writing to the file.
 686 *
 687 * Do this in two main stages: first we copy dirty buffers to a
 688 * temporary inode list, queueing the writes as we go.  Then we clean
 689 * up, waiting for those writes to complete.
 690 * 
 691 * During this second stage, any subsequent updates to the file may end
 692 * up refiling the buffer on the original inode's dirty list again, so
 693 * there is a chance we will end up with a buffer queued for write but
 694 * not yet completed on that list.  So, as a final cleanup we go through
 695 * the osync code to catch these locked, dirty buffers without requeuing
 696 * any newly dirty buffers for write.
 697 */
 698static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 699{
 700        struct buffer_head *bh;
 701        struct list_head tmp;
 702        struct address_space *mapping;
 703        int err = 0, err2;
 704        struct blk_plug plug;
 705
 706        INIT_LIST_HEAD(&tmp);
 707        blk_start_plug(&plug);
 708
 709        spin_lock(lock);
 710        while (!list_empty(list)) {
 711                bh = BH_ENTRY(list->next);
 712                mapping = bh->b_assoc_map;
 713                __remove_assoc_queue(bh);
 714                /* Avoid race with mark_buffer_dirty_inode() which does
 715                 * a lockless check and we rely on seeing the dirty bit */
 716                smp_mb();
 717                if (buffer_dirty(bh) || buffer_locked(bh)) {
 718                        list_add(&bh->b_assoc_buffers, &tmp);
 719                        bh->b_assoc_map = mapping;
 720                        if (buffer_dirty(bh)) {
 721                                get_bh(bh);
 722                                spin_unlock(lock);
 723                                /*
 724                                 * Ensure any pending I/O completes so that
 725                                 * write_dirty_buffer() actually writes the
 726                                 * current contents - it is a noop if I/O is
 727                                 * still in flight on potentially older
 728                                 * contents.
 729                                 */
 730                                write_dirty_buffer(bh, REQ_SYNC);
 731
 732                                /*
 733                                 * Kick off IO for the previous mapping. Note
 734                                 * that we will not run the very last mapping,
 735                                 * wait_on_buffer() will do that for us
 736                                 * through sync_buffer().
 737                                 */
 738                                brelse(bh);
 739                                spin_lock(lock);
 740                        }
 741                }
 742        }
 743
 744        spin_unlock(lock);
 745        blk_finish_plug(&plug);
 746        spin_lock(lock);
 747
 748        while (!list_empty(&tmp)) {
 749                bh = BH_ENTRY(tmp.prev);
 750                get_bh(bh);
 751                mapping = bh->b_assoc_map;
 752                __remove_assoc_queue(bh);
 753                /* Avoid race with mark_buffer_dirty_inode() which does
 754                 * a lockless check and we rely on seeing the dirty bit */
 755                smp_mb();
 756                if (buffer_dirty(bh)) {
 757                        list_add(&bh->b_assoc_buffers,
 758                                 &mapping->private_list);
 759                        bh->b_assoc_map = mapping;
 760                }
 761                spin_unlock(lock);
 762                wait_on_buffer(bh);
 763                if (!buffer_uptodate(bh))
 764                        err = -EIO;
 765                brelse(bh);
 766                spin_lock(lock);
 767        }
 768        
 769        spin_unlock(lock);
 770        err2 = osync_buffers_list(lock, list);
 771        if (err)
 772                return err;
 773        else
 774                return err2;
 775}
 776
 777/*
 778 * Invalidate any and all dirty buffers on a given inode.  We are
 779 * probably unmounting the fs, but that doesn't mean we have already
 780 * done a sync().  Just drop the buffers from the inode list.
 781 *
 782 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 783 * assumes that all the buffers are against the blockdev.  Not true
 784 * for reiserfs.
 785 */
 786void invalidate_inode_buffers(struct inode *inode)
 787{
 788        if (inode_has_buffers(inode)) {
 789                struct address_space *mapping = &inode->i_data;
 790                struct list_head *list = &mapping->private_list;
 791                struct address_space *buffer_mapping = mapping->private_data;
 792
 793                spin_lock(&buffer_mapping->private_lock);
 794                while (!list_empty(list))
 795                        __remove_assoc_queue(BH_ENTRY(list->next));
 796                spin_unlock(&buffer_mapping->private_lock);
 797        }
 798}
 799EXPORT_SYMBOL(invalidate_inode_buffers);
 800
 801/*
 802 * Remove any clean buffers from the inode's buffer list.  This is called
 803 * when we're trying to free the inode itself.  Those buffers can pin it.
 804 *
 805 * Returns true if all buffers were removed.
 806 */
 807int remove_inode_buffers(struct inode *inode)
 808{
 809        int ret = 1;
 810
 811        if (inode_has_buffers(inode)) {
 812                struct address_space *mapping = &inode->i_data;
 813                struct list_head *list = &mapping->private_list;
 814                struct address_space *buffer_mapping = mapping->private_data;
 815
 816                spin_lock(&buffer_mapping->private_lock);
 817                while (!list_empty(list)) {
 818                        struct buffer_head *bh = BH_ENTRY(list->next);
 819                        if (buffer_dirty(bh)) {
 820                                ret = 0;
 821                                break;
 822                        }
 823                        __remove_assoc_queue(bh);
 824                }
 825                spin_unlock(&buffer_mapping->private_lock);
 826        }
 827        return ret;
 828}
 829
 830/*
 831 * Create the appropriate buffers when given a page for data area and
 832 * the size of each buffer.. Use the bh->b_this_page linked list to
 833 * follow the buffers created.  Return NULL if unable to create more
 834 * buffers.
 835 *
 836 * The retry flag is used to differentiate async IO (paging, swapping)
 837 * which may not fail from ordinary buffer allocations.
 838 */
 839struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 840                bool retry)
 841{
 842        struct buffer_head *bh, *head;
 843        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 844        long offset;
 845        struct mem_cgroup *memcg;
 846
 847        if (retry)
 848                gfp |= __GFP_NOFAIL;
 849
 850        memcg = get_mem_cgroup_from_page(page);
 851        memalloc_use_memcg(memcg);
 852
 853        head = NULL;
 854        offset = PAGE_SIZE;
 855        while ((offset -= size) >= 0) {
 856                bh = alloc_buffer_head(gfp);
 857                if (!bh)
 858                        goto no_grow;
 859
 860                bh->b_this_page = head;
 861                bh->b_blocknr = -1;
 862                head = bh;
 863
 864                bh->b_size = size;
 865
 866                /* Link the buffer to its page */
 867                set_bh_page(bh, page, offset);
 868        }
 869out:
 870        memalloc_unuse_memcg();
 871        mem_cgroup_put(memcg);
 872        return head;
 873/*
 874 * In case anything failed, we just free everything we got.
 875 */
 876no_grow:
 877        if (head) {
 878                do {
 879                        bh = head;
 880                        head = head->b_this_page;
 881                        free_buffer_head(bh);
 882                } while (head);
 883        }
 884
 885        goto out;
 886}
 887EXPORT_SYMBOL_GPL(alloc_page_buffers);
 888
 889static inline void
 890link_dev_buffers(struct page *page, struct buffer_head *head)
 891{
 892        struct buffer_head *bh, *tail;
 893
 894        bh = head;
 895        do {
 896                tail = bh;
 897                bh = bh->b_this_page;
 898        } while (bh);
 899        tail->b_this_page = head;
 900        attach_page_private(page, head);
 901}
 902
 903static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 904{
 905        sector_t retval = ~((sector_t)0);
 906        loff_t sz = i_size_read(bdev->bd_inode);
 907
 908        if (sz) {
 909                unsigned int sizebits = blksize_bits(size);
 910                retval = (sz >> sizebits);
 911        }
 912        return retval;
 913}
 914
 915/*
 916 * Initialise the state of a blockdev page's buffers.
 917 */ 
 918static sector_t
 919init_page_buffers(struct page *page, struct block_device *bdev,
 920                        sector_t block, int size)
 921{
 922        struct buffer_head *head = page_buffers(page);
 923        struct buffer_head *bh = head;
 924        int uptodate = PageUptodate(page);
 925        sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 926
 927        do {
 928                if (!buffer_mapped(bh)) {
 929                        bh->b_end_io = NULL;
 930                        bh->b_private = NULL;
 931                        bh->b_bdev = bdev;
 932                        bh->b_blocknr = block;
 933                        if (uptodate)
 934                                set_buffer_uptodate(bh);
 935                        if (block < end_block)
 936                                set_buffer_mapped(bh);
 937                }
 938                block++;
 939                bh = bh->b_this_page;
 940        } while (bh != head);
 941
 942        /*
 943         * Caller needs to validate requested block against end of device.
 944         */
 945        return end_block;
 946}
 947
 948/*
 949 * Create the page-cache page that contains the requested block.
 950 *
 951 * This is used purely for blockdev mappings.
 952 */
 953static int
 954grow_dev_page(struct block_device *bdev, sector_t block,
 955              pgoff_t index, int size, int sizebits, gfp_t gfp)
 956{
 957        struct inode *inode = bdev->bd_inode;
 958        struct page *page;
 959        struct buffer_head *bh;
 960        sector_t end_block;
 961        int ret = 0;
 962        gfp_t gfp_mask;
 963
 964        gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 965
 966        /*
 967         * XXX: __getblk_slow() can not really deal with failure and
 968         * will endlessly loop on improvised global reclaim.  Prefer
 969         * looping in the allocator rather than here, at least that
 970         * code knows what it's doing.
 971         */
 972        gfp_mask |= __GFP_NOFAIL;
 973
 974        page = find_or_create_page(inode->i_mapping, index, gfp_mask);
 975
 976        BUG_ON(!PageLocked(page));
 977
 978        if (page_has_buffers(page)) {
 979                bh = page_buffers(page);
 980                if (bh->b_size == size) {
 981                        end_block = init_page_buffers(page, bdev,
 982                                                (sector_t)index << sizebits,
 983                                                size);
 984                        goto done;
 985                }
 986                if (!try_to_free_buffers(page))
 987                        goto failed;
 988        }
 989
 990        /*
 991         * Allocate some buffers for this page
 992         */
 993        bh = alloc_page_buffers(page, size, true);
 994
 995        /*
 996         * Link the page to the buffers and initialise them.  Take the
 997         * lock to be atomic wrt __find_get_block(), which does not
 998         * run under the page lock.
 999         */
1000        spin_lock(&inode->i_mapping->private_lock);
1001        link_dev_buffers(page, bh);
1002        end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1003                        size);
1004        spin_unlock(&inode->i_mapping->private_lock);
1005done:
1006        ret = (block < end_block) ? 1 : -ENXIO;
1007failed:
1008        unlock_page(page);
1009        put_page(page);
1010        return ret;
1011}
1012
1013/*
1014 * Create buffers for the specified block device block's page.  If
1015 * that page was dirty, the buffers are set dirty also.
1016 */
1017static int
1018grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1019{
1020        pgoff_t index;
1021        int sizebits;
1022
1023        sizebits = -1;
1024        do {
1025                sizebits++;
1026        } while ((size << sizebits) < PAGE_SIZE);
1027
1028        index = block >> sizebits;
1029
1030        /*
1031         * Check for a block which wants to lie outside our maximum possible
1032         * pagecache index.  (this comparison is done using sector_t types).
1033         */
1034        if (unlikely(index != block >> sizebits)) {
1035                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1036                        "device %pg\n",
1037                        __func__, (unsigned long long)block,
1038                        bdev);
1039                return -EIO;
1040        }
1041
1042        /* Create a page with the proper size buffers.. */
1043        return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1044}
1045
1046static struct buffer_head *
1047__getblk_slow(struct block_device *bdev, sector_t block,
1048             unsigned size, gfp_t gfp)
1049{
1050        /* Size must be multiple of hard sectorsize */
1051        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1052                        (size < 512 || size > PAGE_SIZE))) {
1053                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1054                                        size);
1055                printk(KERN_ERR "logical block size: %d\n",
1056                                        bdev_logical_block_size(bdev));
1057
1058                dump_stack();
1059                return NULL;
1060        }
1061
1062        for (;;) {
1063                struct buffer_head *bh;
1064                int ret;
1065
1066                bh = __find_get_block(bdev, block, size);
1067                if (bh)
1068                        return bh;
1069
1070                ret = grow_buffers(bdev, block, size, gfp);
1071                if (ret < 0)
1072                        return NULL;
1073        }
1074}
1075
1076/*
1077 * The relationship between dirty buffers and dirty pages:
1078 *
1079 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1080 * the page is tagged dirty in the page cache.
1081 *
1082 * At all times, the dirtiness of the buffers represents the dirtiness of
1083 * subsections of the page.  If the page has buffers, the page dirty bit is
1084 * merely a hint about the true dirty state.
1085 *
1086 * When a page is set dirty in its entirety, all its buffers are marked dirty
1087 * (if the page has buffers).
1088 *
1089 * When a buffer is marked dirty, its page is dirtied, but the page's other
1090 * buffers are not.
1091 *
1092 * Also.  When blockdev buffers are explicitly read with bread(), they
1093 * individually become uptodate.  But their backing page remains not
1094 * uptodate - even if all of its buffers are uptodate.  A subsequent
1095 * block_read_full_page() against that page will discover all the uptodate
1096 * buffers, will set the page uptodate and will perform no I/O.
1097 */
1098
1099/**
1100 * mark_buffer_dirty - mark a buffer_head as needing writeout
1101 * @bh: the buffer_head to mark dirty
1102 *
1103 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1104 * its backing page dirty, then tag the page as dirty in the page cache
1105 * and then attach the address_space's inode to its superblock's dirty
1106 * inode list.
1107 *
1108 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1109 * i_pages lock and mapping->host->i_lock.
1110 */
1111void mark_buffer_dirty(struct buffer_head *bh)
1112{
1113        WARN_ON_ONCE(!buffer_uptodate(bh));
1114
1115        trace_block_dirty_buffer(bh);
1116
1117        /*
1118         * Very *carefully* optimize the it-is-already-dirty case.
1119         *
1120         * Don't let the final "is it dirty" escape to before we
1121         * perhaps modified the buffer.
1122         */
1123        if (buffer_dirty(bh)) {
1124                smp_mb();
1125                if (buffer_dirty(bh))
1126                        return;
1127        }
1128
1129        if (!test_set_buffer_dirty(bh)) {
1130                struct page *page = bh->b_page;
1131                struct address_space *mapping = NULL;
1132
1133                lock_page_memcg(page);
1134                if (!TestSetPageDirty(page)) {
1135                        mapping = page_mapping(page);
1136                        if (mapping)
1137                                __set_page_dirty(page, mapping, 0);
1138                }
1139                unlock_page_memcg(page);
1140                if (mapping)
1141                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1142        }
1143}
1144EXPORT_SYMBOL(mark_buffer_dirty);
1145
1146void mark_buffer_write_io_error(struct buffer_head *bh)
1147{
1148        struct super_block *sb;
1149
1150        set_buffer_write_io_error(bh);
1151        /* FIXME: do we need to set this in both places? */
1152        if (bh->b_page && bh->b_page->mapping)
1153                mapping_set_error(bh->b_page->mapping, -EIO);
1154        if (bh->b_assoc_map)
1155                mapping_set_error(bh->b_assoc_map, -EIO);
1156        rcu_read_lock();
1157        sb = READ_ONCE(bh->b_bdev->bd_super);
1158        if (sb)
1159                errseq_set(&sb->s_wb_err, -EIO);
1160        rcu_read_unlock();
1161}
1162EXPORT_SYMBOL(mark_buffer_write_io_error);
1163
1164/*
1165 * Decrement a buffer_head's reference count.  If all buffers against a page
1166 * have zero reference count, are clean and unlocked, and if the page is clean
1167 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1168 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1169 * a page but it ends up not being freed, and buffers may later be reattached).
1170 */
1171void __brelse(struct buffer_head * buf)
1172{
1173        if (atomic_read(&buf->b_count)) {
1174                put_bh(buf);
1175                return;
1176        }
1177        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1178}
1179EXPORT_SYMBOL(__brelse);
1180
1181/*
1182 * bforget() is like brelse(), except it discards any
1183 * potentially dirty data.
1184 */
1185void __bforget(struct buffer_head *bh)
1186{
1187        clear_buffer_dirty(bh);
1188        if (bh->b_assoc_map) {
1189                struct address_space *buffer_mapping = bh->b_page->mapping;
1190
1191                spin_lock(&buffer_mapping->private_lock);
1192                list_del_init(&bh->b_assoc_buffers);
1193                bh->b_assoc_map = NULL;
1194                spin_unlock(&buffer_mapping->private_lock);
1195        }
1196        __brelse(bh);
1197}
1198EXPORT_SYMBOL(__bforget);
1199
1200static struct buffer_head *__bread_slow(struct buffer_head *bh)
1201{
1202        lock_buffer(bh);
1203        if (buffer_uptodate(bh)) {
1204                unlock_buffer(bh);
1205                return bh;
1206        } else {
1207                get_bh(bh);
1208                bh->b_end_io = end_buffer_read_sync;
1209                submit_bh(REQ_OP_READ, 0, bh);
1210                wait_on_buffer(bh);
1211                if (buffer_uptodate(bh))
1212                        return bh;
1213        }
1214        brelse(bh);
1215        return NULL;
1216}
1217
1218/*
 1219 * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block().
1220 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1221 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1222 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1223 * CPU's LRUs at the same time.
1224 *
1225 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1226 * sb_find_get_block().
1227 *
1228 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1229 * a local interrupt disable for that.
1230 */
1231
1232#define BH_LRU_SIZE     16
1233
1234struct bh_lru {
1235        struct buffer_head *bhs[BH_LRU_SIZE];
1236};
1237
1238static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1239
1240#ifdef CONFIG_SMP
1241#define bh_lru_lock()   local_irq_disable()
1242#define bh_lru_unlock() local_irq_enable()
1243#else
1244#define bh_lru_lock()   preempt_disable()
1245#define bh_lru_unlock() preempt_enable()
1246#endif
1247
1248static inline void check_irqs_on(void)
1249{
1250#ifdef irqs_disabled
1251        BUG_ON(irqs_disabled());
1252#endif
1253}
1254
1255/*
1256 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
1257 * inserted at the front, and the buffer_head at the back if any is evicted.
1258 * Or, if already in the LRU it is moved to the front.
1259 */
1260static void bh_lru_install(struct buffer_head *bh)
1261{
1262        struct buffer_head *evictee = bh;
1263        struct bh_lru *b;
1264        int i;
1265
1266        check_irqs_on();
1267        bh_lru_lock();
1268
1269        b = this_cpu_ptr(&bh_lrus);
1270        for (i = 0; i < BH_LRU_SIZE; i++) {
1271                swap(evictee, b->bhs[i]);
1272                if (evictee == bh) {
1273                        bh_lru_unlock();
1274                        return;
1275                }
1276        }
1277
1278        get_bh(bh);
1279        bh_lru_unlock();
1280        brelse(evictee);
1281}
1282
1283/*
1284 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1285 */
1286static struct buffer_head *
1287lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1288{
1289        struct buffer_head *ret = NULL;
1290        unsigned int i;
1291
1292        check_irqs_on();
1293        bh_lru_lock();
1294        for (i = 0; i < BH_LRU_SIZE; i++) {
1295                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1296
1297                if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1298                    bh->b_size == size) {
1299                        if (i) {
1300                                while (i) {
1301                                        __this_cpu_write(bh_lrus.bhs[i],
1302                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
1303                                        i--;
1304                                }
1305                                __this_cpu_write(bh_lrus.bhs[0], bh);
1306                        }
1307                        get_bh(bh);
1308                        ret = bh;
1309                        break;
1310                }
1311        }
1312        bh_lru_unlock();
1313        return ret;
1314}
1315
1316/*
1317 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1318 * it in the LRU and mark it as accessed.  If it is not present then return
1319 * NULL
1320 */
1321struct buffer_head *
1322__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1323{
1324        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1325
1326        if (bh == NULL) {
1327                /* __find_get_block_slow will mark the page accessed */
1328                bh = __find_get_block_slow(bdev, block);
1329                if (bh)
1330                        bh_lru_install(bh);
1331        } else
1332                touch_buffer(bh);
1333
1334        return bh;
1335}
1336EXPORT_SYMBOL(__find_get_block);
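/*
 * Sketch of the lookup/creation hierarchy built around this function (the
 * sb_* wrappers in buffer_head.h simply pass sb->s_bdev and sb->s_blocksize):
 *
 *      bh = __find_get_block(bdev, block, size);  pagecache lookup only, may be NULL
 *      bh = __getblk(bdev, block, size);          creates the buffer if needed, no read
 *      bh = __bread(bdev, block, size);           creates and reads it, NULL on I/O error
 */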
1337
1338/*
1339 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1340 * which corresponds to the passed block_device, block and size. The
1341 * returned buffer has its reference count incremented.
1342 *
1343 * __getblk_gfp() will lock up the machine if grow_dev_page's
1344 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1345 */
1346struct buffer_head *
1347__getblk_gfp(struct block_device *bdev, sector_t block,
1348             unsigned size, gfp_t gfp)
1349{
1350        struct buffer_head *bh = __find_get_block(bdev, block, size);
1351
1352        might_sleep();
1353        if (bh == NULL)
1354                bh = __getblk_slow(bdev, block, size, gfp);
1355        return bh;
1356}
1357EXPORT_SYMBOL(__getblk_gfp);
1358
1359/*
1360 * Do async read-ahead on a buffer..
1361 */
1362void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1363{
1364        struct buffer_head *bh = __getblk(bdev, block, size);
1365        if (likely(bh)) {
1366                ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1367                brelse(bh);
1368        }
1369}
1370EXPORT_SYMBOL(__breadahead);
1371
1372void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
1373                      gfp_t gfp)
1374{
1375        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1376        if (likely(bh)) {
1377                ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1378                brelse(bh);
1379        }
1380}
1381EXPORT_SYMBOL(__breadahead_gfp);
1382
1383/**
1384 *  __bread_gfp() - reads a specified block and returns the bh
1385 *  @bdev: the block_device to read from
1386 *  @block: number of block
1387 *  @size: size (in bytes) to read
1388 *  @gfp: page allocation flag
1389 *
1390 *  Reads a specified block, and returns buffer head that contains it.
1391 *  The page cache can be allocated from non-movable area
1392 *  not to prevent page migration if you set gfp to zero.
1393 *  It returns NULL if the block was unreadable.
1394 */
1395struct buffer_head *
1396__bread_gfp(struct block_device *bdev, sector_t block,
1397                   unsigned size, gfp_t gfp)
1398{
1399        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1400
1401        if (likely(bh) && !buffer_uptodate(bh))
1402                bh = __bread_slow(bh);
1403        return bh;
1404}
1405EXPORT_SYMBOL(__bread_gfp);
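/*
 * Typical read-modify-write use of __bread() (minimal sketch; "offset",
 * "src" and "len" are placeholders and error handling is reduced to the
 * NULL check):
 *
 *      bh = __bread(bdev, block, blocksize);
 *      if (!bh)
 *              return -EIO;
 *      memcpy(bh->b_data + offset, src, len);
 *      mark_buffer_dirty(bh);
 *      sync_dirty_buffer(bh);
 *      brelse(bh);
 *
 * The sync_dirty_buffer() step is only needed when the caller must have the
 * block on disk before returning.
 */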
1406
1407/*
1408 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1409 * This doesn't race because it runs in each cpu either in irq
1410 * or with preempt disabled.
1411 */
1412static void invalidate_bh_lru(void *arg)
1413{
1414        struct bh_lru *b = &get_cpu_var(bh_lrus);
1415        int i;
1416
1417        for (i = 0; i < BH_LRU_SIZE; i++) {
1418                brelse(b->bhs[i]);
1419                b->bhs[i] = NULL;
1420        }
1421        put_cpu_var(bh_lrus);
1422}
1423
1424static bool has_bh_in_lru(int cpu, void *dummy)
1425{
1426        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1427        int i;
1428        
1429        for (i = 0; i < BH_LRU_SIZE; i++) {
1430                if (b->bhs[i])
1431                        return true;
1432        }
1433
1434        return false;
1435}
1436
1437void invalidate_bh_lrus(void)
1438{
1439        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1440}
1441EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1442
1443void set_bh_page(struct buffer_head *bh,
1444                struct page *page, unsigned long offset)
1445{
1446        bh->b_page = page;
1447        BUG_ON(offset >= PAGE_SIZE);
1448        if (PageHighMem(page))
1449                /*
1450                 * This catches illegal uses and preserves the offset:
1451                 */
1452                bh->b_data = (char *)(0 + offset);
1453        else
1454                bh->b_data = page_address(page) + offset;
1455}
1456EXPORT_SYMBOL(set_bh_page);
1457
1458/*
1459 * Called when truncating a buffer on a page completely.
1460 */
1461
1462/* Bits that are cleared during an invalidate */
1463#define BUFFER_FLAGS_DISCARD \
1464        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1465         1 << BH_Delay | 1 << BH_Unwritten)
1466
1467static void discard_buffer(struct buffer_head * bh)
1468{
1469        unsigned long b_state, b_state_old;
1470
1471        lock_buffer(bh);
1472        clear_buffer_dirty(bh);
1473        bh->b_bdev = NULL;
1474        b_state = bh->b_state;
1475        for (;;) {
1476                b_state_old = cmpxchg(&bh->b_state, b_state,
1477                                      (b_state & ~BUFFER_FLAGS_DISCARD));
1478                if (b_state_old == b_state)
1479                        break;
1480                b_state = b_state_old;
1481        }
1482        unlock_buffer(bh);
1483}
1484
1485/**
1486 * block_invalidatepage - invalidate part or all of a buffer-backed page
1487 *
1488 * @page: the page which is affected
1489 * @offset: start of the range to invalidate
1490 * @length: length of the range to invalidate
1491 *
1492 * block_invalidatepage() is called when all or part of the page has become
1493 * invalidated by a truncate operation.
1494 *
1495 * block_invalidatepage() does not have to release all buffers, but it must
1496 * ensure that no dirty buffer is left outside @offset and that no I/O
1497 * is underway against any of the blocks which are outside the truncation
1498 * point.  Because the caller is about to free (and possibly reuse) those
1499 * blocks on-disk.
1500 */
1501void block_invalidatepage(struct page *page, unsigned int offset,
1502                          unsigned int length)
1503{
1504        struct buffer_head *head, *bh, *next;
1505        unsigned int curr_off = 0;
1506        unsigned int stop = length + offset;
1507
1508        BUG_ON(!PageLocked(page));
1509        if (!page_has_buffers(page))
1510                goto out;
1511
1512        /*
1513         * Check for overflow
1514         */
1515        BUG_ON(stop > PAGE_SIZE || stop < length);
1516
1517        head = page_buffers(page);
1518        bh = head;
1519        do {
1520                unsigned int next_off = curr_off + bh->b_size;
1521                next = bh->b_this_page;
1522
1523                /*
1524                 * Are we still fully in range ?
1525                 */
1526                if (next_off > stop)
1527                        goto out;
1528
1529                /*
1530                 * is this block fully invalidated?
1531                 */
1532                if (offset <= curr_off)
1533                        discard_buffer(bh);
1534                curr_off = next_off;
1535                bh = next;
1536        } while (bh != head);
1537
1538        /*
1539         * We release buffers only if the entire page is being invalidated.
1540         * The get_block cached value has been unconditionally invalidated,
1541         * so real IO is not possible anymore.
1542         */
1543        if (length == PAGE_SIZE)
1544                try_to_release_page(page, 0);
1545out:
1546        return;
1547}
1548EXPORT_SYMBOL(block_invalidatepage);
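
/*
 * Illustrative sketch, not part of this file: a filesystem whose
 * ->invalidatepage needs some private cleanup of its own typically does
 * that first and then delegates the buffer work to block_invalidatepage().
 * The "myfs" names below are hypothetical.
 */
static void myfs_invalidatepage(struct page *page, unsigned int offset,
                                unsigned int length)
{
        /* hypothetical hook: drop private, per-page filesystem state first */
        myfs_forget_page_private(page, offset, length);

        block_invalidatepage(page, offset, length);
}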
1549
1550
1551/*
1552 * We attach and possibly dirty the buffers atomically wrt
1553 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1554 * is already excluded via the page lock.
1555 */
1556void create_empty_buffers(struct page *page,
1557                        unsigned long blocksize, unsigned long b_state)
1558{
1559        struct buffer_head *bh, *head, *tail;
1560
1561        head = alloc_page_buffers(page, blocksize, true);
1562        bh = head;
1563        do {
1564                bh->b_state |= b_state;
1565                tail = bh;
1566                bh = bh->b_this_page;
1567        } while (bh);
1568        tail->b_this_page = head;
1569
1570        spin_lock(&page->mapping->private_lock);
1571        if (PageUptodate(page) || PageDirty(page)) {
1572                bh = head;
1573                do {
1574                        if (PageDirty(page))
1575                                set_buffer_dirty(bh);
1576                        if (PageUptodate(page))
1577                                set_buffer_uptodate(bh);
1578                        bh = bh->b_this_page;
1579                } while (bh != head);
1580        }
1581        attach_page_private(page, head);
1582        spin_unlock(&page->mapping->private_lock);
1583}
1584EXPORT_SYMBOL(create_empty_buffers);
1585
1586/**
1587 * clean_bdev_aliases: clean a range of buffers in block device
1588 * @bdev: Block device to clean buffers in
1589 * @block: Start of a range of blocks to clean
1590 * @len: Number of blocks to clean
1591 *
1592 * We are taking a range of blocks for data and we don't want writeback of any
1593 * buffer-cache aliases starting from the moment this function returns until
1594 * something explicitly marks the buffer dirty again (hopefully that will not
1595 * happen until we free that block ;-) We don't even need to mark it
1596 * not-uptodate - nobody can expect anything from a newly allocated buffer
1597 * anyway. We used to use unmap_buffer() for such invalidation, but that was
1598 * wrong. We definitely don't want to mark the alias unmapped, for example - it
1599 * would confuse anyone who might pick it up with bread() afterwards...
1600 *
1601 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
1602 * writeout I/O going on against recently-freed buffers.  We don't wait on that
1603 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1604 * need to.  That happens here.
1605 */
1606void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1607{
1608        struct inode *bd_inode = bdev->bd_inode;
1609        struct address_space *bd_mapping = bd_inode->i_mapping;
1610        struct pagevec pvec;
1611        pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1612        pgoff_t end;
1613        int i, count;
1614        struct buffer_head *bh;
1615        struct buffer_head *head;
1616
1617        end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1618        pagevec_init(&pvec);
1619        while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1620                count = pagevec_count(&pvec);
1621                for (i = 0; i < count; i++) {
1622                        struct page *page = pvec.pages[i];
1623
1624                        if (!page_has_buffers(page))
1625                                continue;
1626                        /*
1627                         * We use page lock instead of bd_mapping->private_lock
1628                         * to pin buffers here since we can afford to sleep and
1629                         * it scales better than a global spinlock.
1630                         */
1631                        lock_page(page);
1632                        /* Recheck when the page is locked which pins bhs */
1633                        if (!page_has_buffers(page))
1634                                goto unlock_page;
1635                        head = page_buffers(page);
1636                        bh = head;
1637                        do {
1638                                if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1639                                        goto next;
1640                                if (bh->b_blocknr >= block + len)
1641                                        break;
1642                                clear_buffer_dirty(bh);
1643                                wait_on_buffer(bh);
1644                                clear_buffer_req(bh);
1645next:
1646                                bh = bh->b_this_page;
1647                        } while (bh != head);
1648unlock_page:
1649                        unlock_page(page);
1650                }
1651                pagevec_release(&pvec);
1652                cond_resched();
1653                /* End of range already reached? */
1654                if (index > end || !index)
1655                        break;
1656        }
1657}
1658EXPORT_SYMBOL(clean_bdev_aliases);
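
/*
 * Illustrative sketch, not part of this file: after allocating a run of
 * blocks for file data, a filesystem can use clean_bdev_aliases() (or the
 * one-block clean_bdev_bh_alias() wrapper) to forget any stale blockdev
 * buffer_heads covering those blocks before it starts writing them.  The
 * "myfs" helper below is hypothetical.
 */
static void myfs_finish_allocation(struct super_block *sb, sector_t phys,
                                   sector_t count)
{
        /* shoot down cached aliases of the newly allocated data blocks */
        clean_bdev_aliases(sb->s_bdev, phys, count);
}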
1659
1660/*
1661 * Size is a power-of-two in the range 512..PAGE_SIZE,
1662 * and the case we care about most is PAGE_SIZE.
1663 *
1664 * So this *could* possibly be written with those
1665 * constraints in mind (relevant mostly if some
1666 * architecture has a slow bit-scan instruction)
1667 */
1668static inline int block_size_bits(unsigned int blocksize)
1669{
1670        return ilog2(blocksize);
1671}
1672
1673static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1674{
1675        BUG_ON(!PageLocked(page));
1676
1677        if (!page_has_buffers(page))
1678                create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
1679                                     b_state);
1680        return page_buffers(page);
1681}
1682
1683/*
1684 * NOTE! All mapped/uptodate combinations are valid:
1685 *
1686 *      Mapped  Uptodate        Meaning
1687 *
1688 *      No      No              "unknown" - must do get_block()
1689 *      No      Yes             "hole" - zero-filled
1690 *      Yes     No              "allocated" - allocated on disk, not read in
1691 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1692 *
1693 * "Dirty" is valid only with the last case (mapped+uptodate).
1694 */
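
/*
 * Illustrative sketch, not part of this file: the table above expressed as
 * explicit buffer_head flag tests (a hypothetical helper, useful e.g. for
 * tracing).
 */
static const char *myfs_bh_state_name(struct buffer_head *bh)
{
        if (!buffer_mapped(bh))
                return buffer_uptodate(bh) ? "hole" : "unknown";
        return buffer_uptodate(bh) ? "valid" : "allocated";
}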
1695
1696/*
1697 * While block_write_full_page is writing back the dirty buffers under
1698 * the page lock, whoever dirtied the buffers may decide to clean them
1699 * again at any time.  We handle that by only looking at the buffer
1700 * state inside lock_buffer().
1701 *
1702 * If block_write_full_page() is called for regular writeback
1703 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1704 * locked buffer.  This can only happen if someone has written the buffer
1705 * directly, with submit_bh().  At the address_space level PageWriteback
1706 * prevents this contention from occurring.
1707 *
1708 * If block_write_full_page() is called with wbc->sync_mode ==
1709 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1710 * causes the writes to be flagged as synchronous writes.
1711 */
1712int __block_write_full_page(struct inode *inode, struct page *page,
1713                        get_block_t *get_block, struct writeback_control *wbc,
1714                        bh_end_io_t *handler)
1715{
1716        int err;
1717        sector_t block;
1718        sector_t last_block;
1719        struct buffer_head *bh, *head;
1720        unsigned int blocksize, bbits;
1721        int nr_underway = 0;
1722        int write_flags = wbc_to_write_flags(wbc);
1723
1724        head = create_page_buffers(page, inode,
1725                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1726
1727        /*
1728         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1729         * here, and the (potentially unmapped) buffers may become dirty at
1730         * any time.  If a buffer becomes dirty here after we've inspected it
1731         * then we just miss that fact, and the page stays dirty.
1732         *
1733         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1734         * handle that here by just cleaning them.
1735         */
1736
1737        bh = head;
1738        blocksize = bh->b_size;
1739        bbits = block_size_bits(blocksize);
1740
1741        block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1742        last_block = (i_size_read(inode) - 1) >> bbits;
1743
1744        /*
1745         * Get all the dirty buffers mapped to disk addresses and
1746         * handle any aliases from the underlying blockdev's mapping.
1747         */
1748        do {
1749                if (block > last_block) {
1750                        /*
1751                         * mapped buffers outside i_size will occur, because
1752                         * this page can be outside i_size when there is a
1753                         * truncate in progress.
1754                         */
1755                        /*
1756                         * The buffer was zeroed by block_write_full_page()
1757                         */
1758                        clear_buffer_dirty(bh);
1759                        set_buffer_uptodate(bh);
1760                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1761                           buffer_dirty(bh)) {
1762                        WARN_ON(bh->b_size != blocksize);
1763                        err = get_block(inode, block, bh, 1);
1764                        if (err)
1765                                goto recover;
1766                        clear_buffer_delay(bh);
1767                        if (buffer_new(bh)) {
1768                                /* blockdev mappings never come here */
1769                                clear_buffer_new(bh);
1770                                clean_bdev_bh_alias(bh);
1771                        }
1772                }
1773                bh = bh->b_this_page;
1774                block++;
1775        } while (bh != head);
1776
1777        do {
1778                if (!buffer_mapped(bh))
1779                        continue;
1780                /*
1781                 * If it's a fully non-blocking write attempt and we cannot
1782                 * lock the buffer then redirty the page.  Note that this can
1783                 * potentially cause a busy-wait loop from writeback threads
1784                 * and kswapd activity, but those code paths have their own
1785                 * higher-level throttling.
1786                 */
1787                if (wbc->sync_mode != WB_SYNC_NONE) {
1788                        lock_buffer(bh);
1789                } else if (!trylock_buffer(bh)) {
1790                        redirty_page_for_writepage(wbc, page);
1791                        continue;
1792                }
1793                if (test_clear_buffer_dirty(bh)) {
1794                        mark_buffer_async_write_endio(bh, handler);
1795                } else {
1796                        unlock_buffer(bh);
1797                }
1798        } while ((bh = bh->b_this_page) != head);
1799
1800        /*
1801         * The page and its buffers are protected by PageWriteback(), so we can
1802         * drop the bh refcounts early.
1803         */
1804        BUG_ON(PageWriteback(page));
1805        set_page_writeback(page);
1806
1807        do {
1808                struct buffer_head *next = bh->b_this_page;
1809                if (buffer_async_write(bh)) {
1810                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
1811                                        inode->i_write_hint, wbc);
1812                        nr_underway++;
1813                }
1814                bh = next;
1815        } while (bh != head);
1816        unlock_page(page);
1817
1818        err = 0;
1819done:
1820        if (nr_underway == 0) {
1821                /*
1822                 * The page was marked dirty, but the buffers were
1823                 * clean.  Someone wrote them back by hand with
1824                 * ll_rw_block/submit_bh.  A rare case.
1825                 */
1826                end_page_writeback(page);
1827
1828                /*
1829                 * The page and buffer_heads can be released at any time from
1830                 * here on.
1831                 */
1832        }
1833        return err;
1834
1835recover:
1836        /*
1837         * ENOSPC, or some other error.  We may already have added some
1838         * blocks to the file, so we need to write these out to avoid
1839         * exposing stale data.
1840 * The page is currently locked and not marked for writeback.
1841         */
1842        bh = head;
1843        /* Recovery: lock and submit the mapped buffers */
1844        do {
1845                if (buffer_mapped(bh) && buffer_dirty(bh) &&
1846                    !buffer_delay(bh)) {
1847                        lock_buffer(bh);
1848                        mark_buffer_async_write_endio(bh, handler);
1849                } else {
1850                        /*
1851                         * The buffer may have been set dirty during
1852                         * attachment to a dirty page.
1853                         */
1854                        clear_buffer_dirty(bh);
1855                }
1856        } while ((bh = bh->b_this_page) != head);
1857        SetPageError(page);
1858        BUG_ON(PageWriteback(page));
1859        mapping_set_error(page->mapping, err);
1860        set_page_writeback(page);
1861        do {
1862                struct buffer_head *next = bh->b_this_page;
1863                if (buffer_async_write(bh)) {
1864                        clear_buffer_dirty(bh);
1865                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
1866                                        inode->i_write_hint, wbc);
1867                        nr_underway++;
1868                }
1869                bh = next;
1870        } while (bh != head);
1871        unlock_page(page);
1872        goto done;
1873}
1874EXPORT_SYMBOL(__block_write_full_page);
1875
1876/*
1877 * If a page has any new buffers, zero them out here, and mark them uptodate
1878 * and dirty so they'll be written out (in order to prevent uninitialised
1879 * block data from leaking), and clear the new bit.
1880 */
1881void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1882{
1883        unsigned int block_start, block_end;
1884        struct buffer_head *head, *bh;
1885
1886        BUG_ON(!PageLocked(page));
1887        if (!page_has_buffers(page))
1888                return;
1889
1890        bh = head = page_buffers(page);
1891        block_start = 0;
1892        do {
1893                block_end = block_start + bh->b_size;
1894
1895                if (buffer_new(bh)) {
1896                        if (block_end > from && block_start < to) {
1897                                if (!PageUptodate(page)) {
1898                                        unsigned start, size;
1899
1900                                        start = max(from, block_start);
1901                                        size = min(to, block_end) - start;
1902
1903                                        zero_user(page, start, size);
1904                                        set_buffer_uptodate(bh);
1905                                }
1906
1907                                clear_buffer_new(bh);
1908                                mark_buffer_dirty(bh);
1909                        }
1910                }
1911
1912                block_start = block_end;
1913                bh = bh->b_this_page;
1914        } while (bh != head);
1915}
1916EXPORT_SYMBOL(page_zero_new_buffers);
1917
1918static void
1919iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1920                struct iomap *iomap)
1921{
1922        loff_t offset = block << inode->i_blkbits;
1923
1924        bh->b_bdev = iomap->bdev;
1925
1926        /*
1927         * Block points to the offset in the file we need to map; iomap contains
1928         * the offset at which the map starts. If the map ends before the
1929         * current block, then do not map the buffer and let the caller
1930         * handle it.
1931         */
1932        BUG_ON(offset >= iomap->offset + iomap->length);
1933
1934        switch (iomap->type) {
1935        case IOMAP_HOLE:
1936                /*
1937                 * If the buffer is not up to date or beyond the current EOF,
1938                 * we need to mark it as new to ensure sub-block zeroing is
1939                 * executed if necessary.
1940                 */
1941                if (!buffer_uptodate(bh) ||
1942                    (offset >= i_size_read(inode)))
1943                        set_buffer_new(bh);
1944                break;
1945        case IOMAP_DELALLOC:
1946                if (!buffer_uptodate(bh) ||
1947                    (offset >= i_size_read(inode)))
1948                        set_buffer_new(bh);
1949                set_buffer_uptodate(bh);
1950                set_buffer_mapped(bh);
1951                set_buffer_delay(bh);
1952                break;
1953        case IOMAP_UNWRITTEN:
1954                /*
1955                 * For unwritten regions, we always need to ensure that regions
1956                 * in the block we are not writing to are zeroed. Mark the
1957                 * buffer as new to ensure this.
1958                 */
1959                set_buffer_new(bh);
1960                set_buffer_unwritten(bh);
1961                fallthrough;
1962        case IOMAP_MAPPED:
1963                if ((iomap->flags & IOMAP_F_NEW) ||
1964                    offset >= i_size_read(inode))
1965                        set_buffer_new(bh);
1966                bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
1967                                inode->i_blkbits;
1968                set_buffer_mapped(bh);
1969                break;
1970        }
1971}
1972
1973int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
1974                get_block_t *get_block, struct iomap *iomap)
1975{
1976        unsigned from = pos & (PAGE_SIZE - 1);
1977        unsigned to = from + len;
1978        struct inode *inode = page->mapping->host;
1979        unsigned block_start, block_end;
1980        sector_t block;
1981        int err = 0;
1982        unsigned blocksize, bbits;
1983        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1984
1985        BUG_ON(!PageLocked(page));
1986        BUG_ON(from > PAGE_SIZE);
1987        BUG_ON(to > PAGE_SIZE);
1988        BUG_ON(from > to);
1989
1990        head = create_page_buffers(page, inode, 0);
1991        blocksize = head->b_size;
1992        bbits = block_size_bits(blocksize);
1993
1994        block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1995
1996        for(bh = head, block_start = 0; bh != head || !block_start;
1997            block++, block_start=block_end, bh = bh->b_this_page) {
1998                block_end = block_start + blocksize;
1999                if (block_end <= from || block_start >= to) {
2000                        if (PageUptodate(page)) {
2001                                if (!buffer_uptodate(bh))
2002                                        set_buffer_uptodate(bh);
2003                        }
2004                        continue;
2005                }
2006                if (buffer_new(bh))
2007                        clear_buffer_new(bh);
2008                if (!buffer_mapped(bh)) {
2009                        WARN_ON(bh->b_size != blocksize);
2010                        if (get_block) {
2011                                err = get_block(inode, block, bh, 1);
2012                                if (err)
2013                                        break;
2014                        } else {
2015                                iomap_to_bh(inode, block, bh, iomap);
2016                        }
2017
2018                        if (buffer_new(bh)) {
2019                                clean_bdev_bh_alias(bh);
2020                                if (PageUptodate(page)) {
2021                                        clear_buffer_new(bh);
2022                                        set_buffer_uptodate(bh);
2023                                        mark_buffer_dirty(bh);
2024                                        continue;
2025                                }
2026                                if (block_end > to || block_start < from)
2027                                        zero_user_segments(page,
2028                                                to, block_end,
2029                                                block_start, from);
2030                                continue;
2031                        }
2032                }
2033                if (PageUptodate(page)) {
2034                        if (!buffer_uptodate(bh))
2035                                set_buffer_uptodate(bh);
2036                        continue; 
2037                }
2038                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2039                    !buffer_unwritten(bh) &&
2040                     (block_start < from || block_end > to)) {
2041                        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2042                        *wait_bh++=bh;
2043                }
2044        }
2045        /*
2046         * If we issued read requests - let them complete.
2047         */
2048        while(wait_bh > wait) {
2049                wait_on_buffer(*--wait_bh);
2050                if (!buffer_uptodate(*wait_bh))
2051                        err = -EIO;
2052        }
2053        if (unlikely(err))
2054                page_zero_new_buffers(page, from, to);
2055        return err;
2056}
2057
2058int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2059                get_block_t *get_block)
2060{
2061        return __block_write_begin_int(page, pos, len, get_block, NULL);
2062}
2063EXPORT_SYMBOL(__block_write_begin);
2064
2065static int __block_commit_write(struct inode *inode, struct page *page,
2066                unsigned from, unsigned to)
2067{
2068        unsigned block_start, block_end;
2069        int partial = 0;
2070        unsigned blocksize;
2071        struct buffer_head *bh, *head;
2072
2073        bh = head = page_buffers(page);
2074        blocksize = bh->b_size;
2075
2076        block_start = 0;
2077        do {
2078                block_end = block_start + blocksize;
2079                if (block_end <= from || block_start >= to) {
2080                        if (!buffer_uptodate(bh))
2081                                partial = 1;
2082                } else {
2083                        set_buffer_uptodate(bh);
2084                        mark_buffer_dirty(bh);
2085                }
2086                clear_buffer_new(bh);
2087
2088                block_start = block_end;
2089                bh = bh->b_this_page;
2090        } while (bh != head);
2091
2092        /*
2093         * If this is a partial write which happened to make all buffers
2094         * uptodate then we can optimize away a bogus readpage() for
2095         * the next read(). Here we 'discover' whether the page went
2096         * uptodate as a result of this (potentially partial) write.
2097         */
2098        if (!partial)
2099                SetPageUptodate(page);
2100        return 0;
2101}
2102
2103/*
2104 * block_write_begin takes care of the basic task of block allocation and
2105 * bringing partial write blocks uptodate first.
2106 *
2107 * The filesystem needs to handle block truncation upon failure.
2108 */
2109int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2110                unsigned flags, struct page **pagep, get_block_t *get_block)
2111{
2112        pgoff_t index = pos >> PAGE_SHIFT;
2113        struct page *page;
2114        int status;
2115
2116        page = grab_cache_page_write_begin(mapping, index, flags);
2117        if (!page)
2118                return -ENOMEM;
2119
2120        status = __block_write_begin(page, pos, len, get_block);
2121        if (unlikely(status)) {
2122                unlock_page(page);
2123                put_page(page);
2124                page = NULL;
2125        }
2126
2127        *pagep = page;
2128        return status;
2129}
2130EXPORT_SYMBOL(block_write_begin);
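
/*
 * Illustrative sketch, not part of this file: a typical ->write_begin built
 * on block_write_begin().  "myfs_get_block" and "myfs_write_failed" (which
 * truncates any blocks instantiated beyond the final i_size, as the comment
 * above requires) are hypothetical filesystem helpers.
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
{
        int ret;

        ret = block_write_begin(mapping, pos, len, flags, pagep,
                                myfs_get_block);
        if (ret)
                myfs_write_failed(mapping, pos + len);
        return ret;
}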
2131
2132int block_write_end(struct file *file, struct address_space *mapping,
2133                        loff_t pos, unsigned len, unsigned copied,
2134                        struct page *page, void *fsdata)
2135{
2136        struct inode *inode = mapping->host;
2137        unsigned start;
2138
2139        start = pos & (PAGE_SIZE - 1);
2140
2141        if (unlikely(copied < len)) {
2142                /*
2143                 * The buffers that were written will now be uptodate, so we
2144                 * don't have to worry about a readpage reading them and
2145                 * overwriting a partial write. However if we have encountered
2146                 * a short write and only partially written into a buffer, it
2147                 * will not be marked uptodate, so a readpage might come in and
2148                 * destroy our partial write.
2149                 *
2150                 * Do the simplest thing, and just treat any short write to a
2151                 * non uptodate page as a zero-length write, and force the
2152                 * caller to redo the whole thing.
2153                 */
2154                if (!PageUptodate(page))
2155                        copied = 0;
2156
2157                page_zero_new_buffers(page, start+copied, start+len);
2158        }
2159        flush_dcache_page(page);
2160
2161        /* This could be a short (even 0-length) commit */
2162        __block_commit_write(inode, page, start, start+copied);
2163
2164        return copied;
2165}
2166EXPORT_SYMBOL(block_write_end);
2167
2168int generic_write_end(struct file *file, struct address_space *mapping,
2169                        loff_t pos, unsigned len, unsigned copied,
2170                        struct page *page, void *fsdata)
2171{
2172        struct inode *inode = mapping->host;
2173        loff_t old_size = inode->i_size;
2174        bool i_size_changed = false;
2175
2176        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2177
2178        /*
2179         * No need to use i_size_read() here, the i_size cannot change under us
2180         * because we hold i_rwsem.
2181         *
2182         * But it's important to update i_size while still holding page lock:
2183         * page writeout could otherwise come in and zero beyond i_size.
2184         */
2185        if (pos + copied > inode->i_size) {
2186                i_size_write(inode, pos + copied);
2187                i_size_changed = true;
2188        }
2189
2190        unlock_page(page);
2191        put_page(page);
2192
2193        if (old_size < pos)
2194                pagecache_isize_extended(inode, old_size, pos);
2195        /*
2196 * Don't mark the inode dirty under the page lock. First, it unnecessarily
2197 * lengthens the time the page lock is held. Second, it forces lock
2198         * ordering of page lock and transaction start for journaling
2199         * filesystems.
2200         */
2201        if (i_size_changed)
2202                mark_inode_dirty(inode);
2203        return copied;
2204}
2205EXPORT_SYMBOL(generic_write_end);
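
/*
 * Illustrative sketch, not part of this file: many buffer_head based
 * filesystems pair a block_write_begin() based ->write_begin with
 * generic_write_end() directly in their address_space_operations.  Field
 * names are as in this kernel; the other "myfs" methods are the
 * hypothetical wrappers sketched alongside the corresponding helpers in
 * these annotations.
 */
static const struct address_space_operations myfs_aops = {
        .readpage       = myfs_readpage,
        .writepage      = myfs_writepage,
        .write_begin    = myfs_write_begin,
        .write_end      = generic_write_end,
        .bmap           = myfs_bmap,
};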
2206
2207/*
2208 * block_is_partially_uptodate checks whether buffers within a page are
2209 * uptodate or not.
2210 *
2211 * Returns true if all buffers which correspond to a file portion
2212 * we want to read are uptodate.
2213 */
2214int block_is_partially_uptodate(struct page *page, unsigned long from,
2215                                        unsigned long count)
2216{
2217        unsigned block_start, block_end, blocksize;
2218        unsigned to;
2219        struct buffer_head *bh, *head;
2220        int ret = 1;
2221
2222        if (!page_has_buffers(page))
2223                return 0;
2224
2225        head = page_buffers(page);
2226        blocksize = head->b_size;
2227        to = min_t(unsigned, PAGE_SIZE - from, count);
2228        to = from + to;
2229        if (from < blocksize && to > PAGE_SIZE - blocksize)
2230                return 0;
2231
2232        bh = head;
2233        block_start = 0;
2234        do {
2235                block_end = block_start + blocksize;
2236                if (block_end > from && block_start < to) {
2237                        if (!buffer_uptodate(bh)) {
2238                                ret = 0;
2239                                break;
2240                        }
2241                        if (block_end >= to)
2242                                break;
2243                }
2244                block_start = block_end;
2245                bh = bh->b_this_page;
2246        } while (bh != head);
2247
2248        return ret;
2249}
2250EXPORT_SYMBOL(block_is_partially_uptodate);
2251
2252/*
2253 * Generic "read page" function for block devices that have the normal
2254 * get_block functionality. This is most of the block device filesystems.
2255 * Reads the page asynchronously --- the unlock_buffer() and
2256 * set/clear_buffer_uptodate() functions propagate buffer state into the
2257 * page struct once IO has completed.
2258 */
2259int block_read_full_page(struct page *page, get_block_t *get_block)
2260{
2261        struct inode *inode = page->mapping->host;
2262        sector_t iblock, lblock;
2263        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2264        unsigned int blocksize, bbits;
2265        int nr, i;
2266        int fully_mapped = 1;
2267
2268        head = create_page_buffers(page, inode, 0);
2269        blocksize = head->b_size;
2270        bbits = block_size_bits(blocksize);
2271
2272        iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2273        lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2274        bh = head;
2275        nr = 0;
2276        i = 0;
2277
2278        do {
2279                if (buffer_uptodate(bh))
2280                        continue;
2281
2282                if (!buffer_mapped(bh)) {
2283                        int err = 0;
2284
2285                        fully_mapped = 0;
2286                        if (iblock < lblock) {
2287                                WARN_ON(bh->b_size != blocksize);
2288                                err = get_block(inode, iblock, bh, 0);
2289                                if (err)
2290                                        SetPageError(page);
2291                        }
2292                        if (!buffer_mapped(bh)) {
2293                                zero_user(page, i * blocksize, blocksize);
2294                                if (!err)
2295                                        set_buffer_uptodate(bh);
2296                                continue;
2297                        }
2298                        /*
2299                         * get_block() might have updated the buffer
2300                         * synchronously
2301                         */
2302                        if (buffer_uptodate(bh))
2303                                continue;
2304                }
2305                arr[nr++] = bh;
2306        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2307
2308        if (fully_mapped)
2309                SetPageMappedToDisk(page);
2310
2311        if (!nr) {
2312                /*
2313                 * All buffers are uptodate - we can set the page uptodate
2314                 * as well. But not if get_block() returned an error.
2315                 */
2316                if (!PageError(page))
2317                        SetPageUptodate(page);
2318                unlock_page(page);
2319                return 0;
2320        }
2321
2322        /* Stage two: lock the buffers */
2323        for (i = 0; i < nr; i++) {
2324                bh = arr[i];
2325                lock_buffer(bh);
2326                mark_buffer_async_read(bh);
2327        }
2328
2329        /*
2330         * Stage 3: start the IO.  Check for uptodateness
2331         * inside the buffer lock in case another process reading
2332         * the underlying blockdev brought it uptodate (the sct fix).
2333         */
2334        for (i = 0; i < nr; i++) {
2335                bh = arr[i];
2336                if (buffer_uptodate(bh))
2337                        end_buffer_async_read(bh, 1);
2338                else
2339                        submit_bh(REQ_OP_READ, 0, bh);
2340        }
2341        return 0;
2342}
2343EXPORT_SYMBOL(block_read_full_page);
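
/*
 * Illustrative sketch, not part of this file: the usual ->readpage for a
 * get_block based filesystem is a one-line wrapper; per-buffer completion
 * then brings the page uptodate as described above.  "myfs_get_block" is
 * hypothetical.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, myfs_get_block);
}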
2344
2345/* Utility function for filesystems that need to do work on expanding
2346 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2347 * deal with the hole.
2348 */
2349int generic_cont_expand_simple(struct inode *inode, loff_t size)
2350{
2351        struct address_space *mapping = inode->i_mapping;
2352        struct page *page;
2353        void *fsdata;
2354        int err;
2355
2356        err = inode_newsize_ok(inode, size);
2357        if (err)
2358                goto out;
2359
2360        err = pagecache_write_begin(NULL, mapping, size, 0,
2361                                    AOP_FLAG_CONT_EXPAND, &page, &fsdata);
2362        if (err)
2363                goto out;
2364
2365        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2366        BUG_ON(err > 0);
2367
2368out:
2369        return err;
2370}
2371EXPORT_SYMBOL(generic_cont_expand_simple);
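
/*
 * Illustrative sketch, not part of this file: a filesystem that cannot
 * represent holes might grow a file from its setattr path like this
 * (hypothetical helper; generic_cont_expand_simple() zero-fills and
 * extends via the pagecache write path).
 */
static int myfs_expand(struct inode *inode, loff_t newsize)
{
        if (newsize <= i_size_read(inode))
                return 0;
        return generic_cont_expand_simple(inode, newsize);
}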
2372
2373static int cont_expand_zero(struct file *file, struct address_space *mapping,
2374                            loff_t pos, loff_t *bytes)
2375{
2376        struct inode *inode = mapping->host;
2377        unsigned int blocksize = i_blocksize(inode);
2378        struct page *page;
2379        void *fsdata;
2380        pgoff_t index, curidx;
2381        loff_t curpos;
2382        unsigned zerofrom, offset, len;
2383        int err = 0;
2384
2385        index = pos >> PAGE_SHIFT;
2386        offset = pos & ~PAGE_MASK;
2387
2388        while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2389                zerofrom = curpos & ~PAGE_MASK;
2390                if (zerofrom & (blocksize-1)) {
2391                        *bytes |= (blocksize-1);
2392                        (*bytes)++;
2393                }
2394                len = PAGE_SIZE - zerofrom;
2395
2396                err = pagecache_write_begin(file, mapping, curpos, len, 0,
2397                                            &page, &fsdata);
2398                if (err)
2399                        goto out;
2400                zero_user(page, zerofrom, len);
2401                err = pagecache_write_end(file, mapping, curpos, len, len,
2402                                                page, fsdata);
2403                if (err < 0)
2404                        goto out;
2405                BUG_ON(err != len);
2406                err = 0;
2407
2408                balance_dirty_pages_ratelimited(mapping);
2409
2410                if (fatal_signal_pending(current)) {
2411                        err = -EINTR;
2412                        goto out;
2413                }
2414        }
2415
2416        /* page covers the boundary, find the boundary offset */
2417        if (index == curidx) {
2418                zerofrom = curpos & ~PAGE_MASK;
2419                /* if we are expanding the file, the last block will be filled */
2420                if (offset <= zerofrom) {
2421                        goto out;
2422                }
2423                if (zerofrom & (blocksize-1)) {
2424                        *bytes |= (blocksize-1);
2425                        (*bytes)++;
2426                }
2427                len = offset - zerofrom;
2428
2429                err = pagecache_write_begin(file, mapping, curpos, len, 0,
2430                                            &page, &fsdata);
2431                if (err)
2432                        goto out;
2433                zero_user(page, zerofrom, len);
2434                err = pagecache_write_end(file, mapping, curpos, len, len,
2435                                                page, fsdata);
2436                if (err < 0)
2437                        goto out;
2438                BUG_ON(err != len);
2439                err = 0;
2440        }
2441out:
2442        return err;
2443}
2444
2445/*
2446 * For moronic filesystems that do not allow holes in files.
2447 * We may have to extend the file.
2448 */
2449int cont_write_begin(struct file *file, struct address_space *mapping,
2450                        loff_t pos, unsigned len, unsigned flags,
2451                        struct page **pagep, void **fsdata,
2452                        get_block_t *get_block, loff_t *bytes)
2453{
2454        struct inode *inode = mapping->host;
2455        unsigned int blocksize = i_blocksize(inode);
2456        unsigned int zerofrom;
2457        int err;
2458
2459        err = cont_expand_zero(file, mapping, pos, bytes);
2460        if (err)
2461                return err;
2462
2463        zerofrom = *bytes & ~PAGE_MASK;
2464        if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2465                *bytes |= (blocksize-1);
2466                (*bytes)++;
2467        }
2468
2469        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2470}
2471EXPORT_SYMBOL(cont_write_begin);
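
/*
 * Illustrative sketch, not part of this file: a hole-less filesystem wires
 * cont_write_begin() up with a pointer to its "bytes allocated/zeroed so
 * far" counter, which the helpers above advance as they fill the gap.  All
 * "nohole_*" names are hypothetical.
 */
static int nohole_write_begin(struct file *file,
                              struct address_space *mapping, loff_t pos,
                              unsigned len, unsigned flags,
                              struct page **pagep, void **fsdata)
{
        struct nohole_inode_info *ni = nohole_i(mapping->host);

        return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                nohole_get_block, &ni->i_allocated_bytes);
}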
2472
2473int block_commit_write(struct page *page, unsigned from, unsigned to)
2474{
2475        struct inode *inode = page->mapping->host;
2476        __block_commit_write(inode,page,from,to);
2477        return 0;
2478}
2479EXPORT_SYMBOL(block_commit_write);
2480
2481/*
2482 * block_page_mkwrite() is not allowed to change the file size as it gets
2483 * called from a page fault handler when a page is first dirtied. Hence we must
2484 * be careful to check for EOF conditions here. We set the page up correctly
2485 * for a written page which means we get ENOSPC checking when writing into
2486 * holes and correct delalloc and unwritten extent mapping on filesystems that
2487 * support these features.
2488 *
2489 * We are not allowed to take the i_rwsem here so we have to play games to
2490 * protect against truncate races as the page could now be beyond EOF.  Because
2491 * truncate writes the inode size before removing pages, once we have the
2492 * page lock we can determine safely if the page is beyond EOF. If it is not
2493 * beyond EOF, then the page is guaranteed safe against truncation until we
2494 * unlock the page.
2495 *
2496 * Direct callers of this function should protect against filesystem freezing
2497 * using sb_start_pagefault() - sb_end_pagefault() functions.
2498 */
2499int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2500                         get_block_t get_block)
2501{
2502        struct page *page = vmf->page;
2503        struct inode *inode = file_inode(vma->vm_file);
2504        unsigned long end;
2505        loff_t size;
2506        int ret;
2507
2508        lock_page(page);
2509        size = i_size_read(inode);
2510        if ((page->mapping != inode->i_mapping) ||
2511            (page_offset(page) > size)) {
2512                /* We overload EFAULT to mean page got truncated */
2513                ret = -EFAULT;
2514                goto out_unlock;
2515        }
2516
2517        /* page is wholly or partially inside EOF */
2518        if (((page->index + 1) << PAGE_SHIFT) > size)
2519                end = size & ~PAGE_MASK;
2520        else
2521                end = PAGE_SIZE;
2522
2523        ret = __block_write_begin(page, 0, end, get_block);
2524        if (!ret)
2525                ret = block_commit_write(page, 0, end);
2526
2527        if (unlikely(ret < 0))
2528                goto out_unlock;
2529        set_page_dirty(page);
2530        wait_for_stable_page(page);
2531        return 0;
2532out_unlock:
2533        unlock_page(page);
2534        return ret;
2535}
2536EXPORT_SYMBOL(block_page_mkwrite);
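
/*
 * Illustrative sketch, not part of this file: a ->page_mkwrite handler
 * built on block_page_mkwrite(), with the freeze protection the comment
 * above asks direct callers to provide.  block_page_mkwrite_return() is
 * the <linux/buffer_head.h> helper that maps the errno to a vm_fault_t;
 * "myfs_get_block" is hypothetical.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        int err;

        sb_start_pagefault(inode->i_sb);
        err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
        sb_end_pagefault(inode->i_sb);
        return block_page_mkwrite_return(err);
}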
2537
2538/*
2539 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2540 * immediately, while under the page lock.  So it needs a special end_io
2541 * handler which does not touch the bh after unlocking it.
2542 */
2543static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2544{
2545        __end_buffer_read_notouch(bh, uptodate);
2546}
2547
2548/*
2549 * Attach the singly-linked list of buffers created by nobh_write_begin to
2550 * the page (converting it to a circular linked list and taking care of page
2551 * dirty races).
2552 */
2553static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2554{
2555        struct buffer_head *bh;
2556
2557        BUG_ON(!PageLocked(page));
2558
2559        spin_lock(&page->mapping->private_lock);
2560        bh = head;
2561        do {
2562                if (PageDirty(page))
2563                        set_buffer_dirty(bh);
2564                if (!bh->b_this_page)
2565                        bh->b_this_page = head;
2566                bh = bh->b_this_page;
2567        } while (bh != head);
2568        attach_page_private(page, head);
2569        spin_unlock(&page->mapping->private_lock);
2570}
2571
2572/*
2573 * On entry, the page is fully not uptodate.
2574 * On exit, the page is fully uptodate in the areas outside (from,to).
2575 * The filesystem needs to handle block truncation upon failure.
2576 */
2577int nobh_write_begin(struct address_space *mapping,
2578                        loff_t pos, unsigned len, unsigned flags,
2579                        struct page **pagep, void **fsdata,
2580                        get_block_t *get_block)
2581{
2582        struct inode *inode = mapping->host;
2583        const unsigned blkbits = inode->i_blkbits;
2584        const unsigned blocksize = 1 << blkbits;
2585        struct buffer_head *head, *bh;
2586        struct page *page;
2587        pgoff_t index;
2588        unsigned from, to;
2589        unsigned block_in_page;
2590        unsigned block_start, block_end;
2591        sector_t block_in_file;
2592        int nr_reads = 0;
2593        int ret = 0;
2594        int is_mapped_to_disk = 1;
2595
2596        index = pos >> PAGE_SHIFT;
2597        from = pos & (PAGE_SIZE - 1);
2598        to = from + len;
2599
2600        page = grab_cache_page_write_begin(mapping, index, flags);
2601        if (!page)
2602                return -ENOMEM;
2603        *pagep = page;
2604        *fsdata = NULL;
2605
2606        if (page_has_buffers(page)) {
2607                ret = __block_write_begin(page, pos, len, get_block);
2608                if (unlikely(ret))
2609                        goto out_release;
2610                return ret;
2611        }
2612
2613        if (PageMappedToDisk(page))
2614                return 0;
2615
2616        /*
2617         * Allocate buffers so that we can keep track of state, and potentially
2618         * attach them to the page if an error occurs. In the common case of
2619         * no error, they will just be freed again without ever being attached
2620         * to the page (which is all OK, because we're under the page lock).
2621         *
2622         * Be careful: the buffer linked list is a NULL terminated one, rather
2623         * than the circular one we're used to.
2624         */
2625        head = alloc_page_buffers(page, blocksize, false);
2626        if (!head) {
2627                ret = -ENOMEM;
2628                goto out_release;
2629        }
2630
2631        block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2632
2633        /*
2634         * We loop across all blocks in the page, whether or not they are
2635         * part of the affected region.  This is so we can discover if the
2636         * page is fully mapped-to-disk.
2637         */
2638        for (block_start = 0, block_in_page = 0, bh = head;
2639                  block_start < PAGE_SIZE;
2640                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2641                int create;
2642
2643                block_end = block_start + blocksize;
2644                bh->b_state = 0;
2645                create = 1;
2646                if (block_start >= to)
2647                        create = 0;
2648                ret = get_block(inode, block_in_file + block_in_page,
2649                                        bh, create);
2650                if (ret)
2651                        goto failed;
2652                if (!buffer_mapped(bh))
2653                        is_mapped_to_disk = 0;
2654                if (buffer_new(bh))
2655                        clean_bdev_bh_alias(bh);
2656                if (PageUptodate(page)) {
2657                        set_buffer_uptodate(bh);
2658                        continue;
2659                }
2660                if (buffer_new(bh) || !buffer_mapped(bh)) {
2661                        zero_user_segments(page, block_start, from,
2662                                                        to, block_end);
2663                        continue;
2664                }
2665                if (buffer_uptodate(bh))
2666                        continue;       /* reiserfs does this */
2667                if (block_start < from || block_end > to) {
2668                        lock_buffer(bh);
2669                        bh->b_end_io = end_buffer_read_nobh;
2670                        submit_bh(REQ_OP_READ, 0, bh);
2671                        nr_reads++;
2672                }
2673        }
2674
2675        if (nr_reads) {
2676                /*
2677                 * The page is locked, so these buffers are protected from
2678                 * any VM or truncate activity.  Hence we don't need to care
2679                 * for the buffer_head refcounts.
2680                 */
2681                for (bh = head; bh; bh = bh->b_this_page) {
2682                        wait_on_buffer(bh);
2683                        if (!buffer_uptodate(bh))
2684                                ret = -EIO;
2685                }
2686                if (ret)
2687                        goto failed;
2688        }
2689
2690        if (is_mapped_to_disk)
2691                SetPageMappedToDisk(page);
2692
2693        *fsdata = head; /* to be released by nobh_write_end */
2694
2695        return 0;
2696
2697failed:
2698        BUG_ON(!ret);
2699        /*
2700         * Error recovery is a bit difficult. We need to zero out blocks that
2701         * were newly allocated, and dirty them to ensure they get written out.
2702         * Buffers need to be attached to the page at this point, otherwise
2703         * the handling of potential IO errors during writeout would be hard
2704         * (could try doing synchronous writeout, but what if that fails too?)
2705         */
2706        attach_nobh_buffers(page, head);
2707        page_zero_new_buffers(page, from, to);
2708
2709out_release:
2710        unlock_page(page);
2711        put_page(page);
2712        *pagep = NULL;
2713
2714        return ret;
2715}
2716EXPORT_SYMBOL(nobh_write_begin);
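
/*
 * Illustrative sketch, not part of this file: the nobh_* helpers are used
 * as a set; a filesystem opting out of per-page buffer_heads might wrap
 * nobh_write_begin() like this and pair it with nobh_write_end() and
 * nobh_writepage().  "myfs_get_block" and "myfs_write_failed" are
 * hypothetical.
 */
static int myfs_nobh_write_begin(struct file *file,
                                 struct address_space *mapping, loff_t pos,
                                 unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
{
        int ret;

        ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
                               myfs_get_block);
        if (ret)
                myfs_write_failed(mapping, pos + len);
        return ret;
}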
2717
2718int nobh_write_end(struct file *file, struct address_space *mapping,
2719                        loff_t pos, unsigned len, unsigned copied,
2720                        struct page *page, void *fsdata)
2721{
2722        struct inode *inode = page->mapping->host;
2723        struct buffer_head *head = fsdata;
2724        struct buffer_head *bh;
2725        BUG_ON(fsdata != NULL && page_has_buffers(page));
2726
2727        if (unlikely(copied < len) && head)
2728                attach_nobh_buffers(page, head);
2729        if (page_has_buffers(page))
2730                return generic_write_end(file, mapping, pos, len,
2731                                        copied, page, fsdata);
2732
2733        SetPageUptodate(page);
2734        set_page_dirty(page);
2735        if (pos+copied > inode->i_size) {
2736                i_size_write(inode, pos+copied);
2737                mark_inode_dirty(inode);
2738        }
2739
2740        unlock_page(page);
2741        put_page(page);
2742
2743        while (head) {
2744                bh = head;
2745                head = head->b_this_page;
2746                free_buffer_head(bh);
2747        }
2748
2749        return copied;
2750}
2751EXPORT_SYMBOL(nobh_write_end);
2752
2753/*
2754 * nobh_writepage() - based on block_write_full_page() except
2755 * that it tries to operate without attaching bufferheads to
2756 * the page.
2757 */
2758int nobh_writepage(struct page *page, get_block_t *get_block,
2759                        struct writeback_control *wbc)
2760{
2761        struct inode * const inode = page->mapping->host;
2762        loff_t i_size = i_size_read(inode);
2763        const pgoff_t end_index = i_size >> PAGE_SHIFT;
2764        unsigned offset;
2765        int ret;
2766
2767        /* Is the page fully inside i_size? */
2768        if (page->index < end_index)
2769                goto out;
2770
2771        /* Is the page fully outside i_size? (truncate in progress) */
2772        offset = i_size & (PAGE_SIZE-1);
2773        if (page->index >= end_index+1 || !offset) {
2774                /*
2775                 * The page may have dirty, unmapped buffers.  For example,
2776                 * they may have been added in ext3_writepage().  Make them
2777                 * freeable here, so the page does not leak.
2778                 */
2779#if 0
2780                /* Not really sure about this - do we need this? */
2781                if (page->mapping->a_ops->invalidatepage)
2782                        page->mapping->a_ops->invalidatepage(page, offset);
2783#endif
2784                unlock_page(page);
2785                return 0; /* don't care */
2786        }
2787
2788        /*
2789         * The page straddles i_size.  It must be zeroed out on each and every
2790         * writepage invocation because it may be mmapped.  "A file is mapped
2791         * in multiples of the page size.  For a file that is not a multiple of
2792         * the  page size, the remaining memory is zeroed when mapped, and
2793         * writes to that region are not written out to the file."
2794         */
2795        zero_user_segment(page, offset, PAGE_SIZE);
2796out:
2797        ret = mpage_writepage(page, get_block, wbc);
2798        if (ret == -EAGAIN)
2799                ret = __block_write_full_page(inode, page, get_block, wbc,
2800                                              end_buffer_async_write);
2801        return ret;
2802}
2803EXPORT_SYMBOL(nobh_writepage);
2804
2805int nobh_truncate_page(struct address_space *mapping,
2806                        loff_t from, get_block_t *get_block)
2807{
2808        pgoff_t index = from >> PAGE_SHIFT;
2809        unsigned offset = from & (PAGE_SIZE-1);
2810        unsigned blocksize;
2811        sector_t iblock;
2812        unsigned length, pos;
2813        struct inode *inode = mapping->host;
2814        struct page *page;
2815        struct buffer_head map_bh;
2816        int err;
2817
2818        blocksize = i_blocksize(inode);
2819        length = offset & (blocksize - 1);
2820
2821        /* Block boundary? Nothing to do */
2822        if (!length)
2823                return 0;
2824
2825        length = blocksize - length;
2826        iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2827
2828        page = grab_cache_page(mapping, index);
2829        err = -ENOMEM;
2830        if (!page)
2831                goto out;
2832
2833        if (page_has_buffers(page)) {
2834has_buffers:
2835                unlock_page(page);
2836                put_page(page);
2837                return block_truncate_page(mapping, from, get_block);
2838        }
2839
2840        /* Find the buffer that contains "offset" */
2841        pos = blocksize;
2842        while (offset >= pos) {
2843                iblock++;
2844                pos += blocksize;
2845        }
2846
2847        map_bh.b_size = blocksize;
2848        map_bh.b_state = 0;
2849        err = get_block(inode, iblock, &map_bh, 0);
2850        if (err)
2851                goto unlock;
2852        /* unmapped? It's a hole - nothing to do */
2853        if (!buffer_mapped(&map_bh))
2854                goto unlock;
2855
2856        /* Ok, it's mapped. Make sure it's up-to-date */
2857        if (!PageUptodate(page)) {
2858                err = mapping->a_ops->readpage(NULL, page);
2859                if (err) {
2860                        put_page(page);
2861                        goto out;
2862                }
2863                lock_page(page);
2864                if (!PageUptodate(page)) {
2865                        err = -EIO;
2866                        goto unlock;
2867                }
2868                if (page_has_buffers(page))
2869                        goto has_buffers;
2870        }
2871        zero_user(page, offset, length);
2872        set_page_dirty(page);
2873        err = 0;
2874
2875unlock:
2876        unlock_page(page);
2877        put_page(page);
2878out:
2879        return err;
2880}
2881EXPORT_SYMBOL(nobh_truncate_page);
2882
2883int block_truncate_page(struct address_space *mapping,
2884                        loff_t from, get_block_t *get_block)
2885{
2886        pgoff_t index = from >> PAGE_SHIFT;
2887        unsigned offset = from & (PAGE_SIZE-1);
2888        unsigned blocksize;
2889        sector_t iblock;
2890        unsigned length, pos;
2891        struct inode *inode = mapping->host;
2892        struct page *page;
2893        struct buffer_head *bh;
2894        int err;
2895
2896        blocksize = i_blocksize(inode);
2897        length = offset & (blocksize - 1);
2898
2899        /* Block boundary? Nothing to do */
2900        if (!length)
2901                return 0;
2902
2903        length = blocksize - length;
2904        iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2905
2906        page = grab_cache_page(mapping, index);
2907        err = -ENOMEM;
2908        if (!page)
2909                goto out;
2910
2911        if (!page_has_buffers(page))
2912                create_empty_buffers(page, blocksize, 0);
2913
2914        /* Find the buffer that contains "offset" */
2915        bh = page_buffers(page);
2916        pos = blocksize;
2917        while (offset >= pos) {
2918                bh = bh->b_this_page;
2919                iblock++;
2920                pos += blocksize;
2921        }
2922
2923        err = 0;
2924        if (!buffer_mapped(bh)) {
2925                WARN_ON(bh->b_size != blocksize);
2926                err = get_block(inode, iblock, bh, 0);
2927                if (err)
2928                        goto unlock;
2929                /* unmapped? It's a hole - nothing to do */
2930                if (!buffer_mapped(bh))
2931                        goto unlock;
2932        }
2933
2934        /* Ok, it's mapped. Make sure it's up-to-date */
2935        if (PageUptodate(page))
2936                set_buffer_uptodate(bh);
2937
2938        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2939                err = -EIO;
2940                ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2941                wait_on_buffer(bh);
2942                /* Uhhuh. Read error. Complain and punt. */
2943                if (!buffer_uptodate(bh))
2944                        goto unlock;
2945        }
2946
2947        zero_user(page, offset, length);
2948        mark_buffer_dirty(bh);
2949        err = 0;
2950
2951unlock:
2952        unlock_page(page);
2953        put_page(page);
2954out:
2955        return err;
2956}
2957EXPORT_SYMBOL(block_truncate_page);
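
/*
 * Illustrative sketch, not part of this file: on truncate, a filesystem
 * zeroes the tail of the (possibly partial) block containing the new EOF
 * before shrinking i_size, so stale data is not exposed if the file is
 * later extended.  The "myfs" names are hypothetical.
 */
static int myfs_setsize(struct inode *inode, loff_t newsize)
{
        int err;

        err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
        if (err)
                return err;
        truncate_setsize(inode, newsize);
        return 0;
}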
2958
2959/*
2960 * The generic ->writepage function for buffer-backed address_spaces
2961 */
2962int block_write_full_page(struct page *page, get_block_t *get_block,
2963                        struct writeback_control *wbc)
2964{
2965        struct inode * const inode = page->mapping->host;
2966        loff_t i_size = i_size_read(inode);
2967        const pgoff_t end_index = i_size >> PAGE_SHIFT;
2968        unsigned offset;
2969
2970        /* Is the page fully inside i_size? */
2971        if (page->index < end_index)
2972                return __block_write_full_page(inode, page, get_block, wbc,
2973                                               end_buffer_async_write);
2974
2975        /* Is the page fully outside i_size? (truncate in progress) */
2976        offset = i_size & (PAGE_SIZE-1);
2977        if (page->index >= end_index+1 || !offset) {
2978                /*
2979                 * The page may have dirty, unmapped buffers.  For example,
2980                 * they may have been added in ext3_writepage().  Make them
2981                 * freeable here, so the page does not leak.
2982                 */
2983                do_invalidatepage(page, 0, PAGE_SIZE);
2984                unlock_page(page);
2985                return 0; /* don't care */
2986        }
2987
2988        /*
2989         * The page straddles i_size.  It must be zeroed out on each and every
2990         * writepage invocation because it may be mmapped.  "A file is mapped
2991         * in multiples of the page size.  For a file that is not a multiple of
2992         * the  page size, the remaining memory is zeroed when mapped, and
2993         * writes to that region are not written out to the file."
2994         */
2995        zero_user_segment(page, offset, PAGE_SIZE);
2996        return __block_write_full_page(inode, page, get_block, wbc,
2997                                                        end_buffer_async_write);
2998}
2999EXPORT_SYMBOL(block_write_full_page);
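
/*
 * Editorial sketch, not part of buffer.c: a minimal ->writepage for the
 * hypothetical buffer-backed filesystem "foofs", reusing the assumed
 * foofs_get_block() helper.  Simple filesystems typically wrap
 * block_write_full_page() exactly like this.
 */
static int foofs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, foofs_get_block, wbc);
}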
3000
3001sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
3002                            get_block_t *get_block)
3003{
3004        struct inode *inode = mapping->host;
3005        struct buffer_head tmp = {
3006                .b_size = i_blocksize(inode),
3007        };
3008
3009        get_block(inode, block, &tmp, 0);
3010        return tmp.b_blocknr;
3011}
3012EXPORT_SYMBOL(generic_block_bmap);
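
/*
 * Editorial sketch, not part of buffer.c: a ->bmap implementation for the
 * hypothetical "foofs", delegating to generic_block_bmap() with the same
 * assumed get_block routine.
 */
static sector_t foofs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, foofs_get_block);
}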
3013
3014static void end_bio_bh_io_sync(struct bio *bio)
3015{
3016        struct buffer_head *bh = bio->bi_private;
3017
3018        if (unlikely(bio_flagged(bio, BIO_QUIET)))
3019                set_bit(BH_Quiet, &bh->b_state);
3020
3021        bh->b_end_io(bh, !bio->bi_status);
3022        bio_put(bio);
3023}
3024
3025static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3026                         enum rw_hint write_hint, struct writeback_control *wbc)
3027{
3028        struct bio *bio;
3029
3030        BUG_ON(!buffer_locked(bh));
3031        BUG_ON(!buffer_mapped(bh));
3032        BUG_ON(!bh->b_end_io);
3033        BUG_ON(buffer_delay(bh));
3034        BUG_ON(buffer_unwritten(bh));
3035
3036        /*
3037         * Only clear out a write error when rewriting
3038         */
3039        if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
3040                clear_buffer_write_io_error(bh);
3041
3042        bio = bio_alloc(GFP_NOIO, 1);
3043
3044        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
3045
3046        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3047        bio_set_dev(bio, bh->b_bdev);
3048        bio->bi_write_hint = write_hint;
3049
3050        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3051        BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3052
3053        bio->bi_end_io = end_bio_bh_io_sync;
3054        bio->bi_private = bh;
3055
3056        if (buffer_meta(bh))
3057                op_flags |= REQ_META;
3058        if (buffer_prio(bh))
3059                op_flags |= REQ_PRIO;
3060        bio_set_op_attrs(bio, op, op_flags);
3061
3062        /* Take care of bh's that straddle the end of the device */
3063        guard_bio_eod(bio);
3064
3065        if (wbc) {
3066                wbc_init_bio(wbc, bio);
3067                wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
3068        }
3069
3070        submit_bio(bio);
3071        return 0;
3072}
3073
3074int submit_bh(int op, int op_flags, struct buffer_head *bh)
3075{
3076        return submit_bh_wbc(op, op_flags, bh, 0, NULL);
3077}
3078EXPORT_SYMBOL(submit_bh);
3079
3080/**
3081 * ll_rw_block: low-level access to block devices (DEPRECATED)
3082 * @op: whether to %READ or %WRITE
3083 * @op_flags: req_flag_bits
3084 * @nr: number of &struct buffer_heads in the array
3085 * @bhs: array of pointers to &struct buffer_head
3086 *
3087 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3088 * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3089 * @op_flags contains flags modifying the detailed I/O behavior, most notably
3090 * %REQ_RAHEAD.
3091 *
3092 * This function drops any buffer that it cannot get a lock on (with the
3093 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3094 * request, and any buffer that appears to be up-to-date when doing a read
3095 * request.  Further, it marks as clean the buffers that are processed for
3096 * writing (the buffer cache won't assume that they are actually clean
3097 * until the buffer gets unlocked).
3098 *
3099 * ll_rw_block sets b_end_io to a simple completion handler that marks
3100 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3101 * any waiters.
3102 *
3103 * All of the buffers must be for the same device, and their size must be
3104 * a multiple of the currently approved block size for the device.
3105 */
3106void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
3107{
3108        int i;
3109
3110        for (i = 0; i < nr; i++) {
3111                struct buffer_head *bh = bhs[i];
3112
3113                if (!trylock_buffer(bh))
3114                        continue;
3115                if (op == WRITE) {
3116                        if (test_clear_buffer_dirty(bh)) {
3117                                bh->b_end_io = end_buffer_write_sync;
3118                                get_bh(bh);
3119                                submit_bh(op, op_flags, bh);
3120                                continue;
3121                        }
3122                } else {
3123                        if (!buffer_uptodate(bh)) {
3124                                bh->b_end_io = end_buffer_read_sync;
3125                                get_bh(bh);
3126                                submit_bh(op, op_flags, bh);
3127                                continue;
3128                        }
3129                }
3130                unlock_buffer(bh);
3131        }
3132}
3133EXPORT_SYMBOL(ll_rw_block);
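
/*
 * Editorial sketch, not part of buffer.c: opportunistic readahead of an
 * array of already-mapped buffer_heads via the deprecated ll_rw_block().
 * Buffers that are locked or already up-to-date are silently skipped, so
 * the caller must still lock and re-check each buffer before relying on
 * its contents.
 */
static void example_readahead_bhs(struct buffer_head *bhs[], int nr)
{
        ll_rw_block(REQ_OP_READ, REQ_RAHEAD, nr, bhs);
}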
3134
3135void write_dirty_buffer(struct buffer_head *bh, int op_flags)
3136{
3137        lock_buffer(bh);
3138        if (!test_clear_buffer_dirty(bh)) {
3139                unlock_buffer(bh);
3140                return;
3141        }
3142        bh->b_end_io = end_buffer_write_sync;
3143        get_bh(bh);
3144        submit_bh(REQ_OP_WRITE, op_flags, bh);
3145}
3146EXPORT_SYMBOL(write_dirty_buffer);
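
/*
 * Editorial sketch, not part of buffer.c: write_dirty_buffer() only starts
 * the write.  A caller that needs completion typically waits and checks
 * the uptodate bit afterwards, roughly like this:
 */
static int example_flush_bh(struct buffer_head *bh)
{
        write_dirty_buffer(bh, REQ_SYNC);       /* async submission */
        wait_on_buffer(bh);                     /* end_buffer_write_sync unlocks */
        return buffer_uptodate(bh) ? 0 : -EIO;  /* uptodate is cleared on error */
}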
3147
3148/*
3149 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3150 * and then start new I/O and then wait upon it.  The caller must have a ref on
3151 * the buffer_head.
3152 */
3153int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
3154{
3155        int ret = 0;
3156
3157        WARN_ON(atomic_read(&bh->b_count) < 1);
3158        lock_buffer(bh);
3159        if (test_clear_buffer_dirty(bh)) {
3160                /*
3161                 * The bh should be mapped, but it might not be if the
3162                 * device was hot-removed. Not much we can do but fail the I/O.
3163                 */
3164                if (!buffer_mapped(bh)) {
3165                        unlock_buffer(bh);
3166                        return -EIO;
3167                }
3168
3169                get_bh(bh);
3170                bh->b_end_io = end_buffer_write_sync;
3171                ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
3172                wait_on_buffer(bh);
3173                if (!ret && !buffer_uptodate(bh))
3174                        ret = -EIO;
3175        } else {
3176                unlock_buffer(bh);
3177        }
3178        return ret;
3179}
3180EXPORT_SYMBOL(__sync_dirty_buffer);
3181
3182int sync_dirty_buffer(struct buffer_head *bh)
3183{
3184        return __sync_dirty_buffer(bh, REQ_SYNC);
3185}
3186EXPORT_SYMBOL(sync_dirty_buffer);
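
/*
 * Editorial sketch, not part of buffer.c: the common pattern of modifying
 * an on-disk structure held in a buffer_head (a superblock, say) and then
 * writing it out synchronously with sync_dirty_buffer().
 */
static int example_commit_super(struct buffer_head *sb_bh)
{
        /* ...modify the in-memory copy at sb_bh->b_data here... */
        mark_buffer_dirty(sb_bh);
        return sync_dirty_buffer(sb_bh);        /* 0 on success, -EIO on error */
}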
3187
3188/*
3189 * try_to_free_buffers() checks if all the buffers on this particular page
3190 * are unused, and releases them if so.
3191 *
3192 * Exclusion against try_to_free_buffers may be obtained by either
3193 * locking the page or by holding its mapping's private_lock.
3194 *
3195 * If the page is dirty but all the buffers are clean then we need to
3196 * be sure to mark the page clean as well.  This is because the page
3197 * may be against a block device, and a later reattachment of buffers
3198 * to a dirty page will set *all* buffers dirty, which would corrupt
3199 * filesystem data on the same device.
3200 *
3201 * The same applies to regular filesystem pages: if all the buffers are
3202 * clean then we set the page clean and proceed.  To do that, we require
3203 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3204 * private_lock.
3205 *
3206 * try_to_free_buffers() is non-blocking.
3207 */
3208static inline int buffer_busy(struct buffer_head *bh)
3209{
3210        return atomic_read(&bh->b_count) |
3211                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3212}
3213
3214static int
3215drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3216{
3217        struct buffer_head *head = page_buffers(page);
3218        struct buffer_head *bh;
3219
3220        bh = head;
3221        do {
3222                if (buffer_busy(bh))
3223                        goto failed;
3224                bh = bh->b_this_page;
3225        } while (bh != head);
3226
3227        do {
3228                struct buffer_head *next = bh->b_this_page;
3229
3230                if (bh->b_assoc_map)
3231                        __remove_assoc_queue(bh);
3232                bh = next;
3233        } while (bh != head);
3234        *buffers_to_free = head;
3235        detach_page_private(page);
3236        return 1;
3237failed:
3238        return 0;
3239}
3240
3241int try_to_free_buffers(struct page *page)
3242{
3243        struct address_space * const mapping = page->mapping;
3244        struct buffer_head *buffers_to_free = NULL;
3245        int ret = 0;
3246
3247        BUG_ON(!PageLocked(page));
3248        if (PageWriteback(page))
3249                return 0;
3250
3251        if (mapping == NULL) {          /* can this still happen? */
3252                ret = drop_buffers(page, &buffers_to_free);
3253                goto out;
3254        }
3255
3256        spin_lock(&mapping->private_lock);
3257        ret = drop_buffers(page, &buffers_to_free);
3258
3259        /*
3260         * If the filesystem writes its buffers by hand (eg ext3)
3261         * then we can have clean buffers against a dirty page.  We
3262         * clean the page here; otherwise the VM will never notice
3263         * that the filesystem did any IO at all.
3264         *
3265         * Also, during truncate, discard_buffer will have marked all
3266         * the page's buffers clean.  We discover that here and clean
3267         * the page also.
3268         *
3269         * private_lock must be held over this entire operation in order
3270         * to synchronise against __set_page_dirty_buffers and prevent the
3271         * dirty bit from being lost.
3272         */
3273        if (ret)
3274                cancel_dirty_page(page);
3275        spin_unlock(&mapping->private_lock);
3276out:
3277        if (buffers_to_free) {
3278                struct buffer_head *bh = buffers_to_free;
3279
3280                do {
3281                        struct buffer_head *next = bh->b_this_page;
3282                        free_buffer_head(bh);
3283                        bh = next;
3284                } while (bh != buffers_to_free);
3285        }
3286        return ret;
3287}
3288EXPORT_SYMBOL(try_to_free_buffers);
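
/*
 * Editorial sketch, not part of buffer.c: a ->releasepage implementation
 * for the hypothetical "foofs".  Filesystems with no private state beyond
 * the buffers typically just hand the page to try_to_free_buffers().
 */
static int foofs_releasepage(struct page *page, gfp_t gfp)
{
        return try_to_free_buffers(page);
}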
3289
3290/*
3291 * There are no bdflush tunables left.  But distributions are
3292 * still running obsolete flush daemons, so we terminate them here.
3293 *
3294 * Use of bdflush() is deprecated and will be removed in a future kernel.
3295 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3296 */
3297SYSCALL_DEFINE2(bdflush, int, func, long, data)
3298{
3299        static int msg_count;
3300
3301        if (!capable(CAP_SYS_ADMIN))
3302                return -EPERM;
3303
3304        if (msg_count < 5) {
3305                msg_count++;
3306                printk(KERN_INFO
3307                        "warning: process `%s' used the obsolete bdflush"
3308                        " system call\n", current->comm);
3309                printk(KERN_INFO "Fix your initscripts?\n");
3310        }
3311
3312        if (func == 1)
3313                do_exit(0);
3314        return 0;
3315}
3316
3317/*
3318 * Buffer-head allocation
3319 */
3320static struct kmem_cache *bh_cachep __read_mostly;
3321
3322/*
3323 * Once the number of bh's in the machine exceeds this level, we start
3324 * stripping them in writeback.
3325 */
3326static unsigned long max_buffer_heads;
3327
3328int buffer_heads_over_limit;
3329
3330struct bh_accounting {
3331        int nr;                 /* Number of live bh's */
3332        int ratelimit;          /* Limit cacheline bouncing */
3333};
3334
3335static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3336
3337static void recalc_bh_state(void)
3338{
3339        int i;
3340        int tot = 0;
3341
3342        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3343                return;
3344        __this_cpu_write(bh_accounting.ratelimit, 0);
3345        for_each_online_cpu(i)
3346                tot += per_cpu(bh_accounting, i).nr;
3347        buffer_heads_over_limit = (tot > max_buffer_heads);
3348}
3349
3350struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3351{
3352        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3353        if (ret) {
3354                INIT_LIST_HEAD(&ret->b_assoc_buffers);
3355                spin_lock_init(&ret->b_uptodate_lock);
3356                preempt_disable();
3357                __this_cpu_inc(bh_accounting.nr);
3358                recalc_bh_state();
3359                preempt_enable();
3360        }
3361        return ret;
3362}
3363EXPORT_SYMBOL(alloc_buffer_head);
3364
3365void free_buffer_head(struct buffer_head *bh)
3366{
3367        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3368        kmem_cache_free(bh_cachep, bh);
3369        preempt_disable();
3370        __this_cpu_dec(bh_accounting.nr);
3371        recalc_bh_state();
3372        preempt_enable();
3373}
3374EXPORT_SYMBOL(free_buffer_head);
3375
3376static int buffer_exit_cpu_dead(unsigned int cpu)
3377{
3378        int i;
3379        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3380
3381        for (i = 0; i < BH_LRU_SIZE; i++) {
3382                brelse(b->bhs[i]);
3383                b->bhs[i] = NULL;
3384        }
3385        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3386        per_cpu(bh_accounting, cpu).nr = 0;
3387        return 0;
3388}
3389
3390/**
3391 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3392 * @bh: struct buffer_head
3393 *
3394 * Returns true if the buffer is up-to-date; otherwise returns false
3395 * with the buffer locked.
3396 */
3397int bh_uptodate_or_lock(struct buffer_head *bh)
3398{
3399        if (!buffer_uptodate(bh)) {
3400                lock_buffer(bh);
3401                if (!buffer_uptodate(bh))
3402                        return 0;
3403                unlock_buffer(bh);
3404        }
3405        return 1;
3406}
3407EXPORT_SYMBOL(bh_uptodate_or_lock);
3408
3409/**
3410 * bh_submit_read - Submit a locked buffer for reading
3411 * @bh: struct buffer_head
3412 *
3413 * Returns zero on success and -EIO on error.
3414 */
3415int bh_submit_read(struct buffer_head *bh)
3416{
3417        BUG_ON(!buffer_locked(bh));
3418
3419        if (buffer_uptodate(bh)) {
3420                unlock_buffer(bh);
3421                return 0;
3422        }
3423
3424        get_bh(bh);
3425        bh->b_end_io = end_buffer_read_sync;
3426        submit_bh(REQ_OP_READ, 0, bh);
3427        wait_on_buffer(bh);
3428        if (buffer_uptodate(bh))
3429                return 0;
3430        return -EIO;
3431}
3432EXPORT_SYMBOL(bh_submit_read);
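
/*
 * Editorial sketch, not part of buffer.c: the usual pairing of
 * bh_uptodate_or_lock() and bh_submit_read() to read a mapped buffer only
 * when its contents are not already valid.
 */
static int example_read_bh_if_needed(struct buffer_head *bh)
{
        if (bh_uptodate_or_lock(bh))
                return 0;               /* already up-to-date */
        /* bh is now locked; bh_submit_read() unlocks it when the I/O ends */
        return bh_submit_read(bh);      /* 0 on success, -EIO on error */
}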
3433
3434void __init buffer_init(void)
3435{
3436        unsigned long nrpages;
3437        int ret;
3438
3439        bh_cachep = kmem_cache_create("buffer_head",
3440                        sizeof(struct buffer_head), 0,
3441                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3442                                SLAB_MEM_SPREAD),
3443                                NULL);
3444
3445        /*
3446         * Limit the bh occupancy to 10% of ZONE_NORMAL
3447         */
3448        nrpages = (nr_free_buffer_pages() * 10) / 100;
3449        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3450        ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3451                                        NULL, buffer_exit_cpu_dead);
3452        WARN_ON(ret < 0);
3453}
3454