linux/mm/filemap.c
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/module.h>
  13#include <linux/slab.h>
  14#include <linux/compiler.h>
  15#include <linux/fs.h>
  16#include <linux/uaccess.h>
  17#include <linux/aio.h>
  18#include <linux/capability.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/mm.h>
  21#include <linux/swap.h>
  22#include <linux/mman.h>
  23#include <linux/pagemap.h>
  24#include <linux/file.h>
  25#include <linux/uio.h>
  26#include <linux/hash.h>
  27#include <linux/writeback.h>
  28#include <linux/backing-dev.h>
  29#include <linux/pagevec.h>
  30#include <linux/blkdev.h>
  31#include <linux/security.h>
  32#include <linux/syscalls.h>
  33#include <linux/cpuset.h>
  34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  35#include <linux/memcontrol.h>
  36#include <linux/mm_inline.h> /* for page_is_file_cache() */
  37#include "internal.h"
  38
  39/*
  40 * FIXME: remove all knowledge of the buffer layer from the core VM
  41 */
  42#include <linux/buffer_head.h> /* for try_to_free_buffers */
  43
  44#include <asm/mman.h>
  45
  46/*
  47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  48 * though.
  49 *
  50 * Shared mappings now work. 15.8.1995  Bruno.
  51 *
  52 * finished 'unifying' the page and buffer cache and SMP-threaded the
  53 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  54 *
  55 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  56 */
  57
  58/*
  59 * Lock ordering:
  60 *
  61 *  ->i_mmap_lock               (truncate_pagecache)
  62 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  63 *      ->swap_lock             (exclusive_swap_page, others)
  64 *        ->mapping->tree_lock
  65 *
  66 *  ->i_mutex
  67 *    ->i_mmap_lock             (truncate->unmap_mapping_range)
  68 *
  69 *  ->mmap_sem
  70 *    ->i_mmap_lock
  71 *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
  72 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
  73 *
  74 *  ->mmap_sem
  75 *    ->lock_page               (access_process_vm)
  76 *
  77 *  ->i_mutex                   (generic_file_buffered_write)
  78 *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
  79 *
  80 *  ->i_mutex
  81 *    ->i_alloc_sem             (various)
  82 *
  83 *  ->inode_lock
  84 *    ->sb_lock                 (fs/fs-writeback.c)
  85 *    ->mapping->tree_lock      (__sync_single_inode)
  86 *
  87 *  ->i_mmap_lock
  88 *    ->anon_vma.lock           (vma_adjust)
  89 *
  90 *  ->anon_vma.lock
  91 *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
  92 *
  93 *  ->page_table_lock or pte_lock
  94 *    ->swap_lock               (try_to_unmap_one)
  95 *    ->private_lock            (try_to_unmap_one)
  96 *    ->tree_lock               (try_to_unmap_one)
  97 *    ->zone.lru_lock           (follow_page->mark_page_accessed)
  98 *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
  99 *    ->private_lock            (page_remove_rmap->set_page_dirty)
 100 *    ->tree_lock               (page_remove_rmap->set_page_dirty)
 101 *    ->inode_lock              (page_remove_rmap->set_page_dirty)
 102 *    ->inode_lock              (zap_pte_range->set_page_dirty)
 103 *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
 104 *
 105 *  ->task->proc_lock
 106 *    ->dcache_lock             (proc_pid_lookup)
 107 *
 108 *  (code doesn't rely on that order, so you could switch it around)
 109 *  ->tasklist_lock             (memory_failure, collect_procs_ao)
 110 *    ->i_mmap_lock
 111 */
 112
 113/*
 114 * Remove a page from the page cache and free it. Caller has to make
 115 * sure the page is locked and that nobody else uses it - or that usage
 116 * is safe.  The caller must hold the mapping's tree_lock.
 117 */
 118void __remove_from_page_cache(struct page *page)
 119{
 120        struct address_space *mapping = page->mapping;
 121
 122        radix_tree_delete(&mapping->page_tree, page->index);
 123        page->mapping = NULL;
 124        mapping->nrpages--;
 125        __dec_zone_page_state(page, NR_FILE_PAGES);
 126        if (PageSwapBacked(page))
 127                __dec_zone_page_state(page, NR_SHMEM);
 128        BUG_ON(page_mapped(page));
 129
 130        /*
 131         * Some filesystems seem to re-dirty the page even after
 132         * the VM has canceled the dirty bit (eg ext3 journaling).
 133         *
 134         * Fix it up by doing a final dirty accounting check after
 135         * having removed the page entirely.
 136         */
 137        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 138                dec_zone_page_state(page, NR_FILE_DIRTY);
 139                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 140        }
 141}
 142
 143void remove_from_page_cache(struct page *page)
 144{
 145        struct address_space *mapping = page->mapping;
 146
 147        BUG_ON(!PageLocked(page));
 148
 149        spin_lock_irq(&mapping->tree_lock);
 150        __remove_from_page_cache(page);
 151        spin_unlock_irq(&mapping->tree_lock);
 152        mem_cgroup_uncharge_cache_page(page);
 153}
 154
 155static int sync_page(void *word)
 156{
 157        struct address_space *mapping;
 158        struct page *page;
 159
 160        page = container_of((unsigned long *)word, struct page, flags);
 161
 162        /*
 163         * page_mapping() is being called without PG_locked held.
 164         * Some knowledge of the state and use of the page is used to
 165         * reduce the requirements down to a memory barrier.
 166         * The danger here is that a stale page_mapping() return value
 167         * may point to a struct address_space different from the one the
 168         * page is currently associated with, when it is associated with one.
 169         * After smp_mb(), it's either the correct page_mapping() for
 170         * the page, or an old page_mapping() and the page's own
 171         * page_mapping() has gone NULL.
 172         * The ->sync_page() address_space operation must tolerate
 173         * page_mapping() going NULL. By an amazing coincidence,
 174         * this comes about because none of the users of the page
 175         * in the ->sync_page() methods make essential use of the
 176         * page_mapping(), merely passing the page down to the backing
 177         * device's unplug functions when it's non-NULL, which in turn
 178         * ignore it for all cases but swap, where only page_private(page) is
 179         * of interest. When page_mapping() does go NULL, the entire
 180         * call stack gracefully ignores the page and returns.
 181         * -- wli
 182         */
 183        smp_mb();
 184        mapping = page_mapping(page);
 185        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 186                mapping->a_ops->sync_page(page);
 187        io_schedule();
 188        return 0;
 189}
 190
 191static int sync_page_killable(void *word)
 192{
 193        sync_page(word);
 194        return fatal_signal_pending(current) ? -EINTR : 0;
 195}
 196
 197/**
 198 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 199 * @mapping:    address space structure to write
 200 * @start:      offset in bytes where the range starts
 201 * @end:        offset in bytes where the range ends (inclusive)
 202 * @sync_mode:  enable synchronous operation
 203 *
 204 * Start writeback against all of a mapping's dirty pages that lie
 205 * within the byte offsets <start, end> inclusive.
 206 *
 207 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 208 * opposed to a regular memory cleansing writeback.  The difference between
 209 * these two operations is that if a dirty page/buffer is encountered, it must
 210 * be waited upon, and not just skipped over.
 211 */
 212int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 213                                loff_t end, int sync_mode)
 214{
 215        int ret;
 216        struct writeback_control wbc = {
 217                .sync_mode = sync_mode,
 218                .nr_to_write = LONG_MAX,
 219                .range_start = start,
 220                .range_end = end,
 221        };
 222
 223        if (!mapping_cap_writeback_dirty(mapping))
 224                return 0;
 225
 226        ret = do_writepages(mapping, &wbc);
 227        return ret;
 228}
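
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * helper (the name example_write_one_page_range is made up) that starts
 * data-integrity writeback on a single page-sized byte range, using the
 * inclusive <start, end> convention documented above.
 */
static int example_write_one_page_range(struct address_space *mapping,
					loff_t pos)
{
	/* the last byte of the range is pos + PAGE_CACHE_SIZE - 1 */
	return __filemap_fdatawrite_range(mapping, pos,
					  pos + PAGE_CACHE_SIZE - 1,
					  WB_SYNC_ALL);
}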
 229
 230static inline int __filemap_fdatawrite(struct address_space *mapping,
 231        int sync_mode)
 232{
 233        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 234}
 235
 236int filemap_fdatawrite(struct address_space *mapping)
 237{
 238        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 239}
 240EXPORT_SYMBOL(filemap_fdatawrite);
 241
 242int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 243                                loff_t end)
 244{
 245        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 246}
 247EXPORT_SYMBOL(filemap_fdatawrite_range);
 248
 249/**
 250 * filemap_flush - mostly a non-blocking flush
 251 * @mapping:    target address_space
 252 *
 253 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 254 * purposes - I/O may not be started against all dirty pages.
 255 */
 256int filemap_flush(struct address_space *mapping)
 257{
 258        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 259}
 260EXPORT_SYMBOL(filemap_flush);
 261
 262/**
 263 * wait_on_page_writeback_range - wait for writeback to complete
 264 * @mapping:    target address_space
 265 * @start:      beginning page index
 266 * @end:        ending page index
 267 *
 268 * Wait for writeback to complete against pages indexed by start->end
 269 * inclusive
 270 */
 271int wait_on_page_writeback_range(struct address_space *mapping,
 272                                pgoff_t start, pgoff_t end)
 273{
 274        struct pagevec pvec;
 275        int nr_pages;
 276        int ret = 0;
 277        pgoff_t index;
 278
 279        if (end < start)
 280                return 0;
 281
 282        pagevec_init(&pvec, 0);
 283        index = start;
 284        while ((index <= end) &&
 285                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 286                        PAGECACHE_TAG_WRITEBACK,
 287                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 288                unsigned i;
 289
 290                for (i = 0; i < nr_pages; i++) {
 291                        struct page *page = pvec.pages[i];
 292
 293                        /* until radix tree lookup accepts end_index */
 294                        if (page->index > end)
 295                                continue;
 296
 297                        wait_on_page_writeback(page);
 298                        if (PageError(page))
 299                                ret = -EIO;
 300                }
 301                pagevec_release(&pvec);
 302                cond_resched();
 303        }
 304
 305        /* Check for outstanding write errors */
 306        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 307                ret = -ENOSPC;
 308        if (test_and_clear_bit(AS_EIO, &mapping->flags))
 309                ret = -EIO;
 310
 311        return ret;
 312}
 313
 314/**
 315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
 316 * @mapping: address space structure to wait for
 317 * @start:      offset in bytes where the range starts
 318 * @end:        offset in bytes where the range ends (inclusive)
 319 *
 320 * Walk the list of under-writeback pages of the given address space
 321 * in the given range and wait for all of them.
 322 *
 323 * This is just a simple wrapper so that callers don't have to convert offsets
 324 * to page indexes themselves
 325 */
 326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
 327                            loff_t end)
 328{
 329        return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
 330                                            end >> PAGE_CACHE_SHIFT);
 331}
 332EXPORT_SYMBOL(filemap_fdatawait_range);
 333
 334/**
 335 * filemap_fdatawait - wait for all under-writeback pages to complete
 336 * @mapping: address space structure to wait for
 337 *
 338 * Walk the list of under-writeback pages of the given address space
 339 * and wait for all of them.
 340 */
 341int filemap_fdatawait(struct address_space *mapping)
 342{
 343        loff_t i_size = i_size_read(mapping->host);
 344
 345        if (i_size == 0)
 346                return 0;
 347
 348        return wait_on_page_writeback_range(mapping, 0,
 349                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 350}
 351EXPORT_SYMBOL(filemap_fdatawait);
 352
 353int filemap_write_and_wait(struct address_space *mapping)
 354{
 355        int err = 0;
 356
 357        if (mapping->nrpages) {
 358                err = filemap_fdatawrite(mapping);
 359                /*
 360                 * Even if the above returned error, the pages may be
 361                 * written partially (e.g. -ENOSPC), so we wait for it.
 362                 * But the -EIO is special case, it may indicate the worst
 363                 * thing (e.g. bug) happened, so we avoid waiting for it.
 364                 */
 365                if (err != -EIO) {
 366                        int err2 = filemap_fdatawait(mapping);
 367                        if (!err)
 368                                err = err2;
 369                }
 370        }
 371        return err;
 372}
 373EXPORT_SYMBOL(filemap_write_and_wait);
 374
 375/**
 376 * filemap_write_and_wait_range - write out & wait on a file range
 377 * @mapping:    the address_space for the pages
 378 * @lstart:     offset in bytes where the range starts
 379 * @lend:       offset in bytes where the range ends (inclusive)
 380 *
 381 * Write out and wait upon file offsets lstart->lend, inclusive.
 382 *
 383 * Note that `lend' is inclusive (describes the last byte to be written) so
 384 * that this function can be used to write to the very end-of-file (end = -1).
 385 */
 386int filemap_write_and_wait_range(struct address_space *mapping,
 387                                 loff_t lstart, loff_t lend)
 388{
 389        int err = 0;
 390
 391        if (mapping->nrpages) {
 392                err = __filemap_fdatawrite_range(mapping, lstart, lend,
 393                                                 WB_SYNC_ALL);
 394                /* See comment of filemap_write_and_wait() */
 395                if (err != -EIO) {
 396                        int err2 = wait_on_page_writeback_range(mapping,
 397                                                lstart >> PAGE_CACHE_SHIFT,
 398                                                lend >> PAGE_CACHE_SHIFT);
 399                        if (!err)
 400                                err = err2;
 401                }
 402        }
 403        return err;
 404}
 405EXPORT_SYMBOL(filemap_write_and_wait_range);
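
/*
 * Illustrative sketch, not part of the original file: roughly how a
 * filesystem's ->fsync() path might use the helper above to flush and
 * wait on a byte range before writing metadata.  example_fsync_data is
 * a made-up name; the surrounding fsync plumbing is assumed.
 */
static int example_fsync_data(struct inode *inode, loff_t start, loff_t end)
{
	/* `end' is inclusive, so LLONG_MAX (or -1) reaches end-of-file */
	return filemap_write_and_wait_range(inode->i_mapping, start, end);
}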
 406
 407/**
 408 * add_to_page_cache_locked - add a locked page to the pagecache
 409 * @page:       page to add
 410 * @mapping:    the page's address_space
 411 * @offset:     page index
 412 * @gfp_mask:   page allocation mode
 413 *
 414 * This function is used to add a page to the pagecache. The page must be locked.
 415 * This function does not add the page to the LRU.  The caller must do that.
 416 */
 417int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 418                pgoff_t offset, gfp_t gfp_mask)
 419{
 420        int error;
 421
 422        VM_BUG_ON(!PageLocked(page));
 423
 424        error = mem_cgroup_cache_charge(page, current->mm,
 425                                        gfp_mask & GFP_RECLAIM_MASK);
 426        if (error)
 427                goto out;
 428
 429        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 430        if (error == 0) {
 431                page_cache_get(page);
 432                page->mapping = mapping;
 433                page->index = offset;
 434
 435                spin_lock_irq(&mapping->tree_lock);
 436                error = radix_tree_insert(&mapping->page_tree, offset, page);
 437                if (likely(!error)) {
 438                        mapping->nrpages++;
 439                        __inc_zone_page_state(page, NR_FILE_PAGES);
 440                        if (PageSwapBacked(page))
 441                                __inc_zone_page_state(page, NR_SHMEM);
 442                        spin_unlock_irq(&mapping->tree_lock);
 443                } else {
 444                        page->mapping = NULL;
 445                        spin_unlock_irq(&mapping->tree_lock);
 446                        mem_cgroup_uncharge_cache_page(page);
 447                        page_cache_release(page);
 448                }
 449                radix_tree_preload_end();
 450        } else
 451                mem_cgroup_uncharge_cache_page(page);
 452out:
 453        return error;
 454}
 455EXPORT_SYMBOL(add_to_page_cache_locked);
 456
 457int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 458                                pgoff_t offset, gfp_t gfp_mask)
 459{
 460        int ret;
 461
 462        /*
 463         * Splice_read and readahead add shmem/tmpfs pages into the page cache
 464         * before shmem_readpage has a chance to mark them as SwapBacked: they
 465         * need to go on the active_anon lru below, and mem_cgroup_cache_charge
 466         * (called in add_to_page_cache) needs to know where they're going too.
 467         */
 468        if (mapping_cap_swap_backed(mapping))
 469                SetPageSwapBacked(page);
 470
 471        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 472        if (ret == 0) {
 473                if (page_is_file_cache(page))
 474                        lru_cache_add_file(page);
 475                else
 476                        lru_cache_add_active_anon(page);
 477        }
 478        return ret;
 479}
 480EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 481
 482#ifdef CONFIG_NUMA
 483struct page *__page_cache_alloc(gfp_t gfp)
 484{
 485        if (cpuset_do_page_mem_spread()) {
 486                int n = cpuset_mem_spread_node();
 487                return alloc_pages_exact_node(n, gfp, 0);
 488        }
 489        return alloc_pages(gfp, 0);
 490}
 491EXPORT_SYMBOL(__page_cache_alloc);
 492#endif
 493
 494static int __sleep_on_page_lock(void *word)
 495{
 496        io_schedule();
 497        return 0;
 498}
 499
 500/*
 501 * In order to wait for pages to become available there must be
 502 * waitqueues associated with pages. By using a hash table of
 503 * waitqueues where the bucket discipline is to maintain all
 504 * waiters on the same queue and wake all when any of the pages
 505 * become available, and for the woken contexts to check to be
 506 * sure the appropriate page became available, this saves space
 507 * at a cost of "thundering herd" phenomena during rare hash
 508 * collisions.
 509 */
 510static wait_queue_head_t *page_waitqueue(struct page *page)
 511{
 512        const struct zone *zone = page_zone(page);
 513
 514        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 515}
 516
 517static inline void wake_up_page(struct page *page, int bit)
 518{
 519        __wake_up_bit(page_waitqueue(page), &page->flags, bit);
 520}
 521
 522void wait_on_page_bit(struct page *page, int bit_nr)
 523{
 524        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 525
 526        if (test_bit(bit_nr, &page->flags))
 527                __wait_on_bit(page_waitqueue(page), &wait, sync_page,
 528                                                        TASK_UNINTERRUPTIBLE);
 529}
 530EXPORT_SYMBOL(wait_on_page_bit);
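
/*
 * Illustrative sketch, not part of the original file: waiting for
 * writeback to clear on a page the caller already holds a reference to.
 * This is essentially what wait_on_page_writeback() in pagemap.h does;
 * example_wait_for_writeback is a made-up name.
 */
static void example_wait_for_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}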
 531
 532/**
 533 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 534 * @page: Page defining the wait queue of interest
 535 * @waiter: Waiter to add to the queue
 536 *
 537 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 538 */
 539void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
 540{
 541        wait_queue_head_t *q = page_waitqueue(page);
 542        unsigned long flags;
 543
 544        spin_lock_irqsave(&q->lock, flags);
 545        __add_wait_queue(q, waiter);
 546        spin_unlock_irqrestore(&q->lock, flags);
 547}
 548EXPORT_SYMBOL_GPL(add_page_wait_queue);
 549
 550/**
 551 * unlock_page - unlock a locked page
 552 * @page: the page
 553 *
 554 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 555 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 556 * mechanism between PageLocked pages and PageWriteback pages is shared.
 557 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 558 *
 559 * The mb is necessary to enforce ordering between the clear_bit and the read
 560 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 561 */
 562void unlock_page(struct page *page)
 563{
 564        VM_BUG_ON(!PageLocked(page));
 565        clear_bit_unlock(PG_locked, &page->flags);
 566        smp_mb__after_clear_bit();
 567        wake_up_page(page, PG_locked);
 568}
 569EXPORT_SYMBOL(unlock_page);
 570
 571/**
 572 * end_page_writeback - end writeback against a page
 573 * @page: the page
 574 */
 575void end_page_writeback(struct page *page)
 576{
 577        if (TestClearPageReclaim(page))
 578                rotate_reclaimable_page(page);
 579
 580        if (!test_clear_page_writeback(page))
 581                BUG();
 582
 583        smp_mb__after_clear_bit();
 584        wake_up_page(page, PG_writeback);
 585}
 586EXPORT_SYMBOL(end_page_writeback);
 587
 588/**
 589 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 590 * @page: the page to lock
 591 *
 592 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 593 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 594 * chances are that on the second loop, the block layer's plug list is empty,
 595 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 596 */
 597void __lock_page(struct page *page)
 598{
 599        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 600
 601        __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 602                                                        TASK_UNINTERRUPTIBLE);
 603}
 604EXPORT_SYMBOL(__lock_page);
 605
 606int __lock_page_killable(struct page *page)
 607{
 608        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 609
 610        return __wait_on_bit_lock(page_waitqueue(page), &wait,
 611                                        sync_page_killable, TASK_KILLABLE);
 612}
 613EXPORT_SYMBOL_GPL(__lock_page_killable);
 614
 615/**
 616 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 617 * @page: the page to lock
 618 *
 619 * Variant of lock_page that does not require the caller to hold a reference
 620 * on the page's mapping.
 621 */
 622void __lock_page_nosync(struct page *page)
 623{
 624        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 625        __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 626                                                        TASK_UNINTERRUPTIBLE);
 627}
 628
 629/**
 630 * find_get_page - find and get a page reference
 631 * @mapping: the address_space to search
 632 * @offset: the page index
 633 *
 634 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 635 * If yes, increment its refcount and return it; if no, return NULL.
 636 */
 637struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 638{
 639        void **pagep;
 640        struct page *page;
 641
 642        rcu_read_lock();
 643repeat:
 644        page = NULL;
 645        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
 646        if (pagep) {
 647                page = radix_tree_deref_slot(pagep);
 648                if (unlikely(!page || page == RADIX_TREE_RETRY))
 649                        goto repeat;
 650
 651                if (!page_cache_get_speculative(page))
 652                        goto repeat;
 653
 654                /*
 655                 * Has the page moved?
 656                 * This is part of the lockless pagecache protocol. See
 657                 * include/linux/pagemap.h for details.
 658                 */
 659                if (unlikely(page != *pagep)) {
 660                        page_cache_release(page);
 661                        goto repeat;
 662                }
 663        }
 664        rcu_read_unlock();
 665
 666        return page;
 667}
 668EXPORT_SYMBOL(find_get_page);
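
/*
 * Illustrative sketch, not part of the original file: a typical caller
 * pattern for find_get_page().  Because the lookup is lockless, the page
 * may be truncated after the reference is taken, so ->mapping is
 * re-checked under the page lock; the reference must always be dropped.
 * example_page_is_cached_uptodate is a made-up name.
 */
static int example_page_is_cached_uptodate(struct address_space *mapping,
					   pgoff_t offset)
{
	struct page *page = find_get_page(mapping, offset);
	int ret = 0;

	if (!page)
		return 0;
	lock_page(page);
	if (page->mapping == mapping && PageUptodate(page))
		ret = 1;
	unlock_page(page);
	page_cache_release(page);
	return ret;
}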
 669
 670/**
 671 * find_lock_page - locate, pin and lock a pagecache page
 672 * @mapping: the address_space to search
 673 * @offset: the page index
 674 *
 675 * Locates the desired pagecache page, locks it, increments its reference
 676 * count and returns its address.
 677 *
 678 * Returns NULL if the page was not present. find_lock_page() may sleep.
 679 */
 680struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 681{
 682        struct page *page;
 683
 684repeat:
 685        page = find_get_page(mapping, offset);
 686        if (page) {
 687                lock_page(page);
 688                /* Has the page been truncated? */
 689                if (unlikely(page->mapping != mapping)) {
 690                        unlock_page(page);
 691                        page_cache_release(page);
 692                        goto repeat;
 693                }
 694                VM_BUG_ON(page->index != offset);
 695        }
 696        return page;
 697}
 698EXPORT_SYMBOL(find_lock_page);
 699
 700/**
 701 * find_or_create_page - locate or add a pagecache page
 702 * @mapping: the page's address_space
 703 * @index: the page's index into the mapping
 704 * @gfp_mask: page allocation mode
 705 *
 706 * Locates a page in the pagecache.  If the page is not present, a new page
 707 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 708 * LRU list.  The returned page is locked and has its reference count
 709 * incremented.
 710 *
 711 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 712 * allocation!
 713 *
 714 * find_or_create_page() returns the desired page's address, or NULL on
 715 * memory exhaustion.
 716 */
 717struct page *find_or_create_page(struct address_space *mapping,
 718                pgoff_t index, gfp_t gfp_mask)
 719{
 720        struct page *page;
 721        int err;
 722repeat:
 723        page = find_lock_page(mapping, index);
 724        if (!page) {
 725                page = __page_cache_alloc(gfp_mask);
 726                if (!page)
 727                        return NULL;
 728                /*
 729                 * We want a regular kernel memory (not highmem or DMA etc)
 730                 * allocation for the radix tree nodes, but we need to honour
 731                 * the context-specific requirements the caller has asked for.
 732                 * GFP_RECLAIM_MASK collects those requirements.
 733                 */
 734                err = add_to_page_cache_lru(page, mapping, index,
 735                        (gfp_mask & GFP_RECLAIM_MASK));
 736                if (unlikely(err)) {
 737                        page_cache_release(page);
 738                        page = NULL;
 739                        if (err == -EEXIST)
 740                                goto repeat;
 741                }
 742        }
 743        return page;
 744}
 745EXPORT_SYMBOL(find_or_create_page);
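
/*
 * Illustrative sketch, not part of the original file: grabbing (or
 * creating) a locked pagecache page at a given index using the mapping's
 * own allocation constraints, then releasing it.  A real caller would
 * read or fill the page while it is locked; example_touch_index is a
 * made-up name.
 */
static int example_touch_index(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;
	/* page is locked and has an elevated refcount here */
	unlock_page(page);
	page_cache_release(page);
	return 0;
}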
 746
 747/**
 748 * find_get_pages - gang pagecache lookup
 749 * @mapping:    The address_space to search
 750 * @start:      The starting page index
 751 * @nr_pages:   The maximum number of pages
 752 * @pages:      Where the resulting pages are placed
 753 *
 754 * find_get_pages() will search for and return a group of up to
 755 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 756 * find_get_pages() takes a reference against the returned pages.
 757 *
 758 * The search returns a group of mapping-contiguous pages with ascending
 759 * indexes.  There may be holes in the indices due to not-present pages.
 760 *
 761 * find_get_pages() returns the number of pages which were found.
 762 */
 763unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 764                            unsigned int nr_pages, struct page **pages)
 765{
 766        unsigned int i;
 767        unsigned int ret;
 768        unsigned int nr_found;
 769
 770        rcu_read_lock();
 771restart:
 772        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 773                                (void ***)pages, start, nr_pages);
 774        ret = 0;
 775        for (i = 0; i < nr_found; i++) {
 776                struct page *page;
 777repeat:
 778                page = radix_tree_deref_slot((void **)pages[i]);
 779                if (unlikely(!page))
 780                        continue;
 781                /*
 782                 * this can only trigger if nr_found == 1, making livelock
 783                 * a non issue.
 784                 */
 785                if (unlikely(page == RADIX_TREE_RETRY))
 786                        goto restart;
 787
 788                if (!page_cache_get_speculative(page))
 789                        goto repeat;
 790
 791                /* Has the page moved? */
 792                if (unlikely(page != *((void **)pages[i]))) {
 793                        page_cache_release(page);
 794                        goto repeat;
 795                }
 796
 797                pages[ret] = page;
 798                ret++;
 799        }
 800        rcu_read_unlock();
 801        return ret;
 802}
 803
 804/**
 805 * find_get_pages_contig - gang contiguous pagecache lookup
 806 * @mapping:    The address_space to search
 807 * @index:      The starting page index
 808 * @nr_pages:   The maximum number of pages
 809 * @pages:      Where the resulting pages are placed
 810 *
 811 * find_get_pages_contig() works exactly like find_get_pages(), except
 812 * that the returned number of pages are guaranteed to be contiguous.
 813 *
 814 * find_get_pages_contig() returns the number of pages which were found.
 815 */
 816unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 817                               unsigned int nr_pages, struct page **pages)
 818{
 819        unsigned int i;
 820        unsigned int ret;
 821        unsigned int nr_found;
 822
 823        rcu_read_lock();
 824restart:
 825        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 826                                (void ***)pages, index, nr_pages);
 827        ret = 0;
 828        for (i = 0; i < nr_found; i++) {
 829                struct page *page;
 830repeat:
 831                page = radix_tree_deref_slot((void **)pages[i]);
 832                if (unlikely(!page))
 833                        continue;
 834                /*
 835                 * this can only trigger if nr_found == 1, making livelock
 836                 * a non issue.
 837                 */
 838                if (unlikely(page == RADIX_TREE_RETRY))
 839                        goto restart;
 840
 841                if (page->mapping == NULL || page->index != index)
 842                        break;
 843
 844                if (!page_cache_get_speculative(page))
 845                        goto repeat;
 846
 847                /* Has the page moved? */
 848                if (unlikely(page != *((void **)pages[i]))) {
 849                        page_cache_release(page);
 850                        goto repeat;
 851                }
 852
 853                pages[ret] = page;
 854                ret++;
 855                index++;
 856        }
 857        rcu_read_unlock();
 858        return ret;
 859}
 860EXPORT_SYMBOL(find_get_pages_contig);
 861
 862/**
 863 * find_get_pages_tag - find and return pages that match @tag
 864 * @mapping:    the address_space to search
 865 * @index:      the starting page index
 866 * @tag:        the tag index
 867 * @nr_pages:   the maximum number of pages
 868 * @pages:      where the resulting pages are placed
 869 *
 870 * Like find_get_pages, except we only return pages which are tagged with
 871 * @tag.   We update @index to index the next page for the traversal.
 872 */
 873unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 874                        int tag, unsigned int nr_pages, struct page **pages)
 875{
 876        unsigned int i;
 877        unsigned int ret;
 878        unsigned int nr_found;
 879
 880        rcu_read_lock();
 881restart:
 882        nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
 883                                (void ***)pages, *index, nr_pages, tag);
 884        ret = 0;
 885        for (i = 0; i < nr_found; i++) {
 886                struct page *page;
 887repeat:
 888                page = radix_tree_deref_slot((void **)pages[i]);
 889                if (unlikely(!page))
 890                        continue;
 891                /*
 892                 * this can only trigger if nr_found == 1, making livelock
 893                 * a non issue.
 894                 */
 895                if (unlikely(page == RADIX_TREE_RETRY))
 896                        goto restart;
 897
 898                if (!page_cache_get_speculative(page))
 899                        goto repeat;
 900
 901                /* Has the page moved? */
 902                if (unlikely(page != *((void **)pages[i]))) {
 903                        page_cache_release(page);
 904                        goto repeat;
 905                }
 906
 907                pages[ret] = page;
 908                ret++;
 909        }
 910        rcu_read_unlock();
 911
 912        if (ret)
 913                *index = pages[ret - 1]->index + 1;
 914
 915        return ret;
 916}
 917EXPORT_SYMBOL(find_get_pages_tag);
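
/*
 * Illustrative sketch, not part of the original file: walking every page
 * tagged dirty in a mapping, a small batch at a time.  @index is advanced
 * by find_get_pages_tag() itself, and each returned page carries a
 * reference the caller must drop.  example_count_dirty_pages and the
 * batch size of 16 are made up.
 */
static unsigned long example_count_dirty_pages(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned long total = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
					16, pages)) != 0) {
		total += nr;
		for (i = 0; i < nr; i++)
			page_cache_release(pages[i]);
	}
	return total;
}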
 918
 919/**
 920 * grab_cache_page_nowait - returns locked page at given index in given cache
 921 * @mapping: target address_space
 922 * @index: the page index
 923 *
 924 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 925 * This is intended for speculative data generators, where the data can
 926 * be regenerated if the page couldn't be grabbed.  This routine should
 927 * be safe to call while holding the lock for another page.
 928 *
 929 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 930 * and deadlock against the caller's locked page.
 931 */
 932struct page *
 933grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 934{
 935        struct page *page = find_get_page(mapping, index);
 936
 937        if (page) {
 938                if (trylock_page(page))
 939                        return page;
 940                page_cache_release(page);
 941                return NULL;
 942        }
 943        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 944        if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 945                page_cache_release(page);
 946                page = NULL;
 947        }
 948        return page;
 949}
 950EXPORT_SYMBOL(grab_cache_page_nowait);
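
/*
 * Illustrative sketch, not part of the original file: the speculative use
 * case described above, peeking at the next page while another page is
 * already locked.  NULL simply means "regenerate the data later";
 * example_peek_next_page is a made-up name.
 */
static struct page *example_peek_next_page(struct address_space *mapping,
					   pgoff_t index)
{
	/* never sleeps on the page lock, so safe under another page lock */
	return grab_cache_page_nowait(mapping, index + 1);
}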
 951
 952/*
 953 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 954 * a _large_ part of the i/o request. Imagine the worst scenario:
 955 *
 956 *      ---R__________________________________________B__________
 957 *         ^ reading here                             ^ bad block (assume 4k)
 958 *
 959 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 960 * => failing the whole request => read(R) => read(R+1) =>
 961 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 962 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 963 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 964 *
 965 * It is going insane. Fix it by quickly scaling down the readahead size.
 966 */
 967static void shrink_readahead_size_eio(struct file *filp,
 968                                        struct file_ra_state *ra)
 969{
 970        ra->ra_pages /= 4;
 971}
 972
 973/**
 974 * do_generic_file_read - generic file read routine
 975 * @filp:       the file to read
 976 * @ppos:       current file position
 977 * @desc:       read_descriptor
 978 * @actor:      read method
 979 *
 980 * This is a generic file read routine, and uses the
 981 * mapping->a_ops->readpage() function for the actual low-level stuff.
 982 *
 983 * This is really ugly. But the goto's actually try to clarify some
 984 * of the logic when it comes to error handling etc.
 985 */
 986static void do_generic_file_read(struct file *filp, loff_t *ppos,
 987                read_descriptor_t *desc, read_actor_t actor)
 988{
 989        struct address_space *mapping = filp->f_mapping;
 990        struct inode *inode = mapping->host;
 991        struct file_ra_state *ra = &filp->f_ra;
 992        pgoff_t index;
 993        pgoff_t last_index;
 994        pgoff_t prev_index;
 995        unsigned long offset;      /* offset into pagecache page */
 996        unsigned int prev_offset;
 997        int error;
 998
 999        index = *ppos >> PAGE_CACHE_SHIFT;
1000        prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1001        prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
1002        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1003        offset = *ppos & ~PAGE_CACHE_MASK;
1004
1005        for (;;) {
1006                struct page *page;
1007                pgoff_t end_index;
1008                loff_t isize;
1009                unsigned long nr, ret;
1010
1011                cond_resched();
1012find_page:
1013                page = find_get_page(mapping, index);
1014                if (!page) {
1015                        page_cache_sync_readahead(mapping,
1016                                        ra, filp,
1017                                        index, last_index - index);
1018                        page = find_get_page(mapping, index);
1019                        if (unlikely(page == NULL))
1020                                goto no_cached_page;
1021                }
1022                if (PageReadahead(page)) {
1023                        page_cache_async_readahead(mapping,
1024                                        ra, filp, page,
1025                                        index, last_index - index);
1026                }
1027                if (!PageUptodate(page)) {
1028                        if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1029                                        !mapping->a_ops->is_partially_uptodate)
1030                                goto page_not_up_to_date;
1031                        if (!trylock_page(page))
1032                                goto page_not_up_to_date;
1033                        if (!mapping->a_ops->is_partially_uptodate(page,
1034                                                                desc, offset))
1035                                goto page_not_up_to_date_locked;
1036                        unlock_page(page);
1037                }
1038page_ok:
1039                /*
1040                 * i_size must be checked after we know the page is Uptodate.
1041                 *
1042                 * Checking i_size after the check allows us to calculate
1043                 * the correct value for "nr", which means the zero-filled
1044                 * part of the page is not copied back to userspace (unless
1045                 * another truncate extends the file - this is desired though).
1046                 */
1047
1048                isize = i_size_read(inode);
1049                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1050                if (unlikely(!isize || index > end_index)) {
1051                        page_cache_release(page);
1052                        goto out;
1053                }
1054
1055                /* nr is the maximum number of bytes to copy from this page */
1056                nr = PAGE_CACHE_SIZE;
1057                if (index == end_index) {
1058                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1059                        if (nr <= offset) {
1060                                page_cache_release(page);
1061                                goto out;
1062                        }
1063                }
1064                nr = nr - offset;
1065
1066                /* If users can be writing to this page using arbitrary
1067                 * virtual addresses, take care about potential aliasing
1068                 * before reading the page on the kernel side.
1069                 */
1070                if (mapping_writably_mapped(mapping))
1071                        flush_dcache_page(page);
1072
1073                /*
1074                 * When a sequential read accesses a page several times,
1075                 * only mark it as accessed the first time.
1076                 */
1077                if (prev_index != index || offset != prev_offset)
1078                        mark_page_accessed(page);
1079                prev_index = index;
1080
1081                /*
1082                 * Ok, we have the page, and it's up-to-date, so
1083                 * now we can copy it to user space...
1084                 *
1085                 * The actor routine returns how many bytes were actually used..
1086                 * NOTE! This may not be the same as how much of a user buffer
1087                 * we filled up (we may be padding etc), so we can only update
1088                 * "pos" here (the actor routine has to update the user buffer
1089                 * pointers and the remaining count).
1090                 */
1091                ret = actor(desc, page, offset, nr);
1092                offset += ret;
1093                index += offset >> PAGE_CACHE_SHIFT;
1094                offset &= ~PAGE_CACHE_MASK;
1095                prev_offset = offset;
1096
1097                page_cache_release(page);
1098                if (ret == nr && desc->count)
1099                        continue;
1100                goto out;
1101
1102page_not_up_to_date:
1103                /* Get exclusive access to the page ... */
1104                error = lock_page_killable(page);
1105                if (unlikely(error))
1106                        goto readpage_error;
1107
1108page_not_up_to_date_locked:
1109                /* Did it get truncated before we got the lock? */
1110                if (!page->mapping) {
1111                        unlock_page(page);
1112                        page_cache_release(page);
1113                        continue;
1114                }
1115
1116                /* Did somebody else fill it already? */
1117                if (PageUptodate(page)) {
1118                        unlock_page(page);
1119                        goto page_ok;
1120                }
1121
1122readpage:
1123                /* Start the actual read. The read will unlock the page. */
1124                error = mapping->a_ops->readpage(filp, page);
1125
1126                if (unlikely(error)) {
1127                        if (error == AOP_TRUNCATED_PAGE) {
1128                                page_cache_release(page);
1129                                goto find_page;
1130                        }
1131                        goto readpage_error;
1132                }
1133
1134                if (!PageUptodate(page)) {
1135                        error = lock_page_killable(page);
1136                        if (unlikely(error))
1137                                goto readpage_error;
1138                        if (!PageUptodate(page)) {
1139                                if (page->mapping == NULL) {
1140                                        /*
1141                                         * invalidate_inode_pages got it
1142                                         */
1143                                        unlock_page(page);
1144                                        page_cache_release(page);
1145                                        goto find_page;
1146                                }
1147                                unlock_page(page);
1148                                shrink_readahead_size_eio(filp, ra);
1149                                error = -EIO;
1150                                goto readpage_error;
1151                        }
1152                        unlock_page(page);
1153                }
1154
1155                goto page_ok;
1156
1157readpage_error:
1158                /* UHHUH! A synchronous read error occurred. Report it */
1159                desc->error = error;
1160                page_cache_release(page);
1161                goto out;
1162
1163no_cached_page:
1164                /*
1165                 * Ok, it wasn't cached, so we need to create a new
1166                 * page..
1167                 */
1168                page = page_cache_alloc_cold(mapping);
1169                if (!page) {
1170                        desc->error = -ENOMEM;
1171                        goto out;
1172                }
1173                error = add_to_page_cache_lru(page, mapping,
1174                                                index, GFP_KERNEL);
1175                if (error) {
1176                        page_cache_release(page);
1177                        if (error == -EEXIST)
1178                                goto find_page;
1179                        desc->error = error;
1180                        goto out;
1181                }
1182                goto readpage;
1183        }
1184
1185out:
1186        ra->prev_pos = prev_index;
1187        ra->prev_pos <<= PAGE_CACHE_SHIFT;
1188        ra->prev_pos |= prev_offset;
1189
1190        *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1191        file_accessed(filp);
1192}
1193
1194int file_read_actor(read_descriptor_t *desc, struct page *page,
1195                        unsigned long offset, unsigned long size)
1196{
1197        char *kaddr;
1198        unsigned long left, count = desc->count;
1199
1200        if (size > count)
1201                size = count;
1202
1203        /*
1204         * Faults on the destination of a read are common, so do it before
1205         * taking the kmap.
1206         */
1207        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1208                kaddr = kmap_atomic(page, KM_USER0);
1209                left = __copy_to_user_inatomic(desc->arg.buf,
1210                                                kaddr + offset, size);
1211                kunmap_atomic(kaddr, KM_USER0);
1212                if (left == 0)
1213                        goto success;
1214        }
1215
1216        /* Do it the slow way */
1217        kaddr = kmap(page);
1218        left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1219        kunmap(page);
1220
1221        if (left) {
1222                size -= left;
1223                desc->error = -EFAULT;
1224        }
1225success:
1226        desc->count = count - size;
1227        desc->written += size;
1228        desc->arg.buf += size;
1229        return size;
1230}
1231
1232/**
1233 * generic_segment_checks - performs necessary checks before doing a write
1234 * @iov:        io vector request
1235 * @nr_segs:    number of segments in the iovec
1236 * @count:      number of bytes to write
1237 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1238 *
1239 * Adjust number of segments and amount of bytes to write (nr_segs should be
1240 * properly initialized first). Returns appropriate error code that caller
1241 * should return or zero in case that write should be allowed.
1242 */
1243int generic_segment_checks(const struct iovec *iov,
1244                        unsigned long *nr_segs, size_t *count, int access_flags)
1245{
1246        unsigned long   seg;
1247        size_t cnt = 0;
1248        for (seg = 0; seg < *nr_segs; seg++) {
1249                const struct iovec *iv = &iov[seg];
1250
1251                /*
1252                 * If any segment has a negative length, or the cumulative
1253                 * length ever wraps negative then return -EINVAL.
1254                 */
1255                cnt += iv->iov_len;
1256                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1257                        return -EINVAL;
1258                if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1259                        continue;
1260                if (seg == 0)
1261                        return -EFAULT;
1262                *nr_segs = seg;
1263                cnt -= iv->iov_len;     /* This segment is no good */
1264                break;
1265        }
1266        *count = cnt;
1267        return 0;
1268}
1269EXPORT_SYMBOL(generic_segment_checks);
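
/*
 * Illustrative sketch, not part of the original file: the write-side
 * counterpart of the call made in generic_file_aio_read() below.  A write
 * path verifies that the source iovecs are readable with VERIFY_READ;
 * example_check_write_iov is a made-up name.
 */
static ssize_t example_check_write_iov(const struct iovec *iov,
				       unsigned long *nr_segs)
{
	size_t count = 0;
	int err = generic_segment_checks(iov, nr_segs, &count, VERIFY_READ);

	return err ? err : (ssize_t)count;
}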
1270
1271/**
1272 * generic_file_aio_read - generic filesystem read routine
1273 * @iocb:       kernel I/O control block
1274 * @iov:        io vector request
1275 * @nr_segs:    number of segments in the iovec
1276 * @pos:        current file position
1277 *
1278 * This is the "read()" routine for all filesystems
1279 * that can use the page cache directly.
1280 */
1281ssize_t
1282generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1283                unsigned long nr_segs, loff_t pos)
1284{
1285        struct file *filp = iocb->ki_filp;
1286        ssize_t retval;
1287        unsigned long seg;
1288        size_t count;
1289        loff_t *ppos = &iocb->ki_pos;
1290
1291        count = 0;
1292        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1293        if (retval)
1294                return retval;
1295
1296        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1297        if (filp->f_flags & O_DIRECT) {
1298                loff_t size;
1299                struct address_space *mapping;
1300                struct inode *inode;
1301
1302                mapping = filp->f_mapping;
1303                inode = mapping->host;
1304                if (!count)
1305                        goto out; /* skip atime */
1306                size = i_size_read(inode);
1307                if (pos < size) {
1308                        retval = filemap_write_and_wait_range(mapping, pos,
1309                                        pos + iov_length(iov, nr_segs) - 1);
1310                        if (!retval) {
1311                                retval = mapping->a_ops->direct_IO(READ, iocb,
1312                                                        iov, pos, nr_segs);
1313                        }
1314                        if (retval > 0)
1315                                *ppos = pos + retval;
1316                        if (retval) {
1317                                file_accessed(filp);
1318                                goto out;
1319                        }
1320                }
1321        }
1322
1323        for (seg = 0; seg < nr_segs; seg++) {
1324                read_descriptor_t desc;
1325
1326                desc.written = 0;
1327                desc.arg.buf = iov[seg].iov_base;
1328                desc.count = iov[seg].iov_len;
1329                if (desc.count == 0)
1330                        continue;
1331                desc.error = 0;
1332                do_generic_file_read(filp, ppos, &desc, file_read_actor);
1333                retval += desc.written;
1334                if (desc.error) {
1335                        retval = retval ?: desc.error;
1336                        break;
1337                }
1338                if (desc.count > 0)
1339                        break;
1340        }
1341out:
1342        return retval;
1343}
1344EXPORT_SYMBOL(generic_file_aio_read);
1345
1346static ssize_t
1347do_readahead(struct address_space *mapping, struct file *filp,
1348             pgoff_t index, unsigned long nr)
1349{
1350        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1351                return -EINVAL;
1352
1353        force_page_cache_readahead(mapping, filp, index, nr);
1354        return 0;
1355}
1356
1357SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1358{
1359        ssize_t ret;
1360        struct file *file;
1361
1362        ret = -EBADF;
1363        file = fget(fd);
1364        if (file) {
1365                if (file->f_mode & FMODE_READ) {
1366                        struct address_space *mapping = file->f_mapping;
1367                        pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1368                        pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1369                        unsigned long len = end - start + 1;
1370                        ret = do_readahead(mapping, file, start, len);
1371                }
1372                fput(file);
1373        }
1374        return ret;
1375}
1376#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1377asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1378{
1379        return SYSC_readahead((int) fd, offset, (size_t) count);
1380}
1381SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1382#endif
1383
1384#ifdef CONFIG_MMU
1385/**
1386 * page_cache_read - adds requested page to the page cache if not already there
1387 * @file:       file to read
1388 * @offset:     page index
1389 *
1390 * This adds the requested page to the page cache if it isn't already there,
1391 * and schedules an I/O to read in its contents from disk.
1392 */
1393static int page_cache_read(struct file *file, pgoff_t offset)
1394{
1395        struct address_space *mapping = file->f_mapping;
1396        struct page *page; 
1397        int ret;
1398
1399        do {
1400                page = page_cache_alloc_cold(mapping);
1401                if (!page)
1402                        return -ENOMEM;
1403
1404                ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1405                if (ret == 0)
1406                        ret = mapping->a_ops->readpage(file, page);
1407                else if (ret == -EEXIST)
1408                        ret = 0; /* losing race to add is OK */
1409
1410                page_cache_release(page);
1411
1412        } while (ret == AOP_TRUNCATED_PAGE);
1413                
1414        return ret;
1415}
1416
1417#define MMAP_LOTSAMISS  (100)
1418
1419/*
1420 * Synchronous readahead happens when we don't even find
1421 * a page in the page cache at all.
1422 */
1423static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1424                                   struct file_ra_state *ra,
1425                                   struct file *file,
1426                                   pgoff_t offset)
1427{
1428        unsigned long ra_pages;
1429        struct address_space *mapping = file->f_mapping;
1430
1431        /* If we don't want any read-ahead, don't bother */
1432        if (VM_RandomReadHint(vma))
1433                return;
1434
1435        if (VM_SequentialReadHint(vma) ||
1436                        offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1437                page_cache_sync_readahead(mapping, ra, file, offset,
1438                                          ra->ra_pages);
1439                return;
1440        }
1441
1442        if (ra->mmap_miss < INT_MAX)
1443                ra->mmap_miss++;
1444
1445        /*
1446         * Do we miss much more than hit in this file? If so,
1447         * stop bothering with read-ahead. It will only hurt.
1448         */
1449        if (ra->mmap_miss > MMAP_LOTSAMISS)
1450                return;
1451
1452        /*
1453         * mmap read-around
1454         */
1455        ra_pages = max_sane_readahead(ra->ra_pages);
1456        if (ra_pages) {
1457                ra->start = max_t(long, 0, offset - ra_pages/2);
1458                ra->size = ra_pages;
1459                ra->async_size = 0;
1460                ra_submit(ra, mapping, file);
1461        }
1462}
1463
1464/*
1465 * Asynchronous readahead happens when we find the page and PG_readahead,
1466 * so we want to possibly extend the readahead further..
1467 */
1468static void do_async_mmap_readahead(struct vm_area_struct *vma,
1469                                    struct file_ra_state *ra,
1470                                    struct file *file,
1471                                    struct page *page,
1472                                    pgoff_t offset)
1473{
1474        struct address_space *mapping = file->f_mapping;
1475
1476        /* If we don't want any read-ahead, don't bother */
1477        if (VM_RandomReadHint(vma))
1478                return;
1479        if (ra->mmap_miss > 0)
1480                ra->mmap_miss--;
1481        if (PageReadahead(page))
1482                page_cache_async_readahead(mapping, ra, file,
1483                                           page, offset, ra->ra_pages);
1484}
1485
1486/**
1487 * filemap_fault - read in file data for page fault handling
1488 * @vma:        vma in which the fault was taken
1489 * @vmf:        struct vm_fault containing details of the fault
1490 *
1491 * filemap_fault() is invoked via the vma operations vector for a
1492 * mapped memory region to read in file data during a page fault.
1493 *
1494 * The gotos are kind of ugly, but this streamlines the normal case of having
1495 * it in the page cache, and handles the special cases reasonably without
1496 * having a lot of duplicated code.
1497 */
1498int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1499{
1500        int error;
1501        struct file *file = vma->vm_file;
1502        struct address_space *mapping = file->f_mapping;
1503        struct file_ra_state *ra = &file->f_ra;
1504        struct inode *inode = mapping->host;
1505        pgoff_t offset = vmf->pgoff;
1506        struct page *page;
1507        pgoff_t size;
1508        int ret = 0;
1509
1510        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1511        if (offset >= size)
1512                return VM_FAULT_SIGBUS;
1513
1514        /*
1515         * Do we have something in the page cache already?
1516         */
1517        page = find_get_page(mapping, offset);
1518        if (likely(page)) {
1519                /*
1520                 * We found the page, so try async readahead before
1521                 * waiting for the lock.
1522                 */
1523                do_async_mmap_readahead(vma, ra, file, page, offset);
1524                lock_page(page);
1525
1526                /* Did it get truncated? */
1527                if (unlikely(page->mapping != mapping)) {
1528                        unlock_page(page);
1529                        put_page(page);
1530                        goto no_cached_page;
1531                }
1532        } else {
1533                /* No page in the page cache at all */
1534                do_sync_mmap_readahead(vma, ra, file, offset);
1535                count_vm_event(PGMAJFAULT);
1536                ret = VM_FAULT_MAJOR;
1537retry_find:
1538                page = find_lock_page(mapping, offset);
1539                if (!page)
1540                        goto no_cached_page;
1541        }
1542
1543        /*
1544         * We have a locked page in the page cache, now we need to check
1545         * that it's up-to-date. If not, it is going to be due to an error.
1546         */
1547        if (unlikely(!PageUptodate(page)))
1548                goto page_not_uptodate;
1549
1550        /*
1551         * Found the page and have a reference on it.
1552         * We must recheck i_size under page lock.
1553         */
1554        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1555        if (unlikely(offset >= size)) {
1556                unlock_page(page);
1557                page_cache_release(page);
1558                return VM_FAULT_SIGBUS;
1559        }
1560
1561        ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1562        vmf->page = page;
1563        return ret | VM_FAULT_LOCKED;
1564
1565no_cached_page:
1566        /*
1567         * We're only likely to ever get here if MADV_RANDOM is in
1568         * effect.
1569         */
1570        error = page_cache_read(file, offset);
1571
1572        /*
1573         * The page we want has now been added to the page cache.
1574         * In the unlikely event that someone removed it in the
1575         * meantime, we'll just come back here and read it again.
1576         */
1577        if (error >= 0)
1578                goto retry_find;
1579
1580        /*
1581         * An error return from page_cache_read can result if the
1582         * system is low on memory, or a problem occurs while trying
1583         * to schedule I/O.
1584         */
1585        if (error == -ENOMEM)
1586                return VM_FAULT_OOM;
1587        return VM_FAULT_SIGBUS;
1588
1589page_not_uptodate:
1590        /*
1591         * Take care of errors if the page isn't up-to-date.
1592         * Try to re-read it _once_. We do this synchronously,
1593         * because there really aren't any performance issues here
1594         * and we need to check for errors.
1595         */
1596        ClearPageError(page);
1597        error = mapping->a_ops->readpage(file, page);
1598        if (!error) {
1599                wait_on_page_locked(page);
1600                if (!PageUptodate(page))
1601                        error = -EIO;
1602        }
1603        page_cache_release(page);
1604
1605        if (!error || error == AOP_TRUNCATED_PAGE)
1606                goto retry_find;
1607
1608        /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer. */
1609        shrink_readahead_size_eio(file, ra);
1610        return VM_FAULT_SIGBUS;
1611}
1612EXPORT_SYMBOL(filemap_fault);
1613
1614const struct vm_operations_struct generic_file_vm_ops = {
1615        .fault          = filemap_fault,
1616};
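/*
 * Illustrative sketch, not part of the original file: a filesystem that needs
 * its own ->page_mkwrite can still reuse filemap_fault() for the read side.
 * The examplefs_* names are hypothetical; the block is kept out of the build.
 */
#if 0
static int examplefs_page_mkwrite(struct vm_area_struct *vma,
                                  struct vm_fault *vmf)
{
        /* filesystem-specific block allocation/journalling would go here */
        return 0;
}

static const struct vm_operations_struct examplefs_vm_ops = {
        .fault          = filemap_fault,        /* generic read-side fault */
        .page_mkwrite   = examplefs_page_mkwrite,
};

static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);
        vma->vm_ops = &examplefs_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
}
#endif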
1617
1618/* This is used for a general mmap of a disk file */
1619
1620int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
1621{
1622        struct address_space *mapping = file->f_mapping;
1623
1624        if (!mapping->a_ops->readpage)
1625                return -ENOEXEC;
1626        file_accessed(file);
1627        vma->vm_ops = &generic_file_vm_ops;
1628        vma->vm_flags |= VM_CAN_NONLINEAR;
1629        return 0;
1630}
1631
1632/*
1633 * This is for filesystems which do not implement ->writepage.
1634 */
1635int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1636{
1637        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1638                return -EINVAL;
1639        return generic_file_mmap(file, vma);
1640}
1641#else
1642int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
1643{
1644        return -ENOSYS;
1645}
1646int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1647{
1648        return -ENOSYS;
1649}
1650#endif /* CONFIG_MMU */
1651
1652EXPORT_SYMBOL(generic_file_mmap);
1653EXPORT_SYMBOL(generic_file_readonly_mmap);
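/*
 * Illustrative sketch (hypothetical examplefs, not from the original source):
 * a typical disk filesystem wires the generic helpers straight into its
 * file_operations.  A filesystem without ->writepage would use
 * generic_file_readonly_mmap instead of generic_file_mmap.
 */
#if 0
static const struct file_operations examplefs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
};
#endif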
1654
1655static struct page *__read_cache_page(struct address_space *mapping,
1656                                pgoff_t index,
1657                                int (*filler)(void *, struct page *),
1658                                void *data)
1659{
1660        struct page *page;
1661        int err;
1662repeat:
1663        page = find_get_page(mapping, index);
1664        if (!page) {
1665                page = page_cache_alloc_cold(mapping);
1666                if (!page)
1667                        return ERR_PTR(-ENOMEM);
1668                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1669                if (unlikely(err)) {
1670                        page_cache_release(page);
1671                        if (err == -EEXIST)
1672                                goto repeat;
1673                        /* Presumably ENOMEM for radix tree node */
1674                        return ERR_PTR(err);
1675                }
1676                err = filler(data, page);
1677                if (err < 0) {
1678                        page_cache_release(page);
1679                        page = ERR_PTR(err);
1680                }
1681        }
1682        return page;
1683}
1684
1685/**
1686 * read_cache_page_async - read into page cache, fill it if needed
1687 * @mapping:    the page's address_space
1688 * @index:      the page index
1689 * @filler:     function to perform the read
1690 * @data:       first argument passed to @filler
1691 *
1692 * Same as read_cache_page, but don't wait for page to become unlocked
1693 * after submitting it to the filler.
1694 *
1695 * Read into the page cache. If a page already exists, and PageUptodate() is
1696 * not set, try to fill the page but don't wait for it to become unlocked.
1697 *
1698 * If the page does not get brought uptodate, return -EIO.
1699 */
1700struct page *read_cache_page_async(struct address_space *mapping,
1701                                pgoff_t index,
1702                                int (*filler)(void *, struct page *),
1703                                void *data)
1704{
1705        struct page *page;
1706        int err;
1707
1708retry:
1709        page = __read_cache_page(mapping, index, filler, data);
1710        if (IS_ERR(page))
1711                return page;
1712        if (PageUptodate(page))
1713                goto out;
1714
1715        lock_page(page);
1716        if (!page->mapping) {
1717                unlock_page(page);
1718                page_cache_release(page);
1719                goto retry;
1720        }
1721        if (PageUptodate(page)) {
1722                unlock_page(page);
1723                goto out;
1724        }
1725        err = filler(data, page);
1726        if (err < 0) {
1727                page_cache_release(page);
1728                return ERR_PTR(err);
1729        }
1730out:
1731        mark_page_accessed(page);
1732        return page;
1733}
1734EXPORT_SYMBOL(read_cache_page_async);
1735
1736/**
1737 * read_cache_page - read into page cache, fill it if needed
1738 * @mapping:    the page's address_space
1739 * @index:      the page index
1740 * @filler:     function to perform the read
1741 * @data:       first argument passed to @filler
1742 *
1743 * Read into the page cache. If a page already exists, and PageUptodate() is
1744 * not set, try to fill the page then wait for it to become unlocked.
1745 *
1746 * If the page does not get brought uptodate, return -EIO.
1747 */
1748struct page *read_cache_page(struct address_space *mapping,
1749                                pgoff_t index,
1750                                int (*filler)(void *, struct page *),
1751                                void *data)
1752{
1753        struct page *page;
1754
1755        page = read_cache_page_async(mapping, index, filler, data);
1756        if (IS_ERR(page))
1757                goto out;
1758        wait_on_page_locked(page);
1759        if (!PageUptodate(page)) {
1760                page_cache_release(page);
1761                page = ERR_PTR(-EIO);
1762        }
1763 out:
1764        return page;
1765}
1766EXPORT_SYMBOL(read_cache_page);
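/*
 * Illustrative sketch, not part of the original source: a caller that simply
 * wants ->readpage as the filler can pass a thin wrapper like the one below.
 * The examplefs_* names are hypothetical; passing a NULL file to ->readpage
 * is assumed to be acceptable for the filesystem in question.
 */
#if 0
static int examplefs_filler(void *data, struct page *page)
{
        /* @data is the cookie given to read_cache_page(); unused here */
        return page->mapping->a_ops->readpage(NULL, page);
}

static struct page *examplefs_get_page(struct address_space *mapping,
                                        pgoff_t index)
{
        struct page *page;

        page = read_cache_page(mapping, index, examplefs_filler, NULL);
        /*
         * On success the page is uptodate, unlocked and referenced; the
         * caller releases it with page_cache_release() when done.
         */
        return page;
}
#endif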
1767
1768/*
1769 * The logic we want is
1770 *
1771 *      if suid or (sgid and xgrp)
1772 *              remove privs
1773 */
1774int should_remove_suid(struct dentry *dentry)
1775{
1776        mode_t mode = dentry->d_inode->i_mode;
1777        int kill = 0;
1778
1779        /* suid always must be killed */
1780        if (unlikely(mode & S_ISUID))
1781                kill = ATTR_KILL_SUID;
1782
1783        /*
1784         * sgid without any exec bits is just a mandatory locking mark; leave
1785         * it alone.  If some exec bits are set, it's a real sgid; kill it.
1786         */
1787        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1788                kill |= ATTR_KILL_SGID;
1789
1790        if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1791                return kill;
1792
1793        return 0;
1794}
1795EXPORT_SYMBOL(should_remove_suid);
1796
1797static int __remove_suid(struct dentry *dentry, int kill)
1798{
1799        struct iattr newattrs;
1800
1801        newattrs.ia_valid = ATTR_FORCE | kill;
1802        return notify_change(dentry, &newattrs);
1803}
1804
1805int file_remove_suid(struct file *file)
1806{
1807        struct dentry *dentry = file->f_path.dentry;
1808        int killsuid = should_remove_suid(dentry);
1809        int killpriv = security_inode_need_killpriv(dentry);
1810        int error = 0;
1811
1812        if (killpriv < 0)
1813                return killpriv;
1814        if (killpriv)
1815                error = security_inode_killpriv(dentry);
1816        if (!error && killsuid)
1817                error = __remove_suid(dentry, killsuid);
1818
1819        return error;
1820}
1821EXPORT_SYMBOL(file_remove_suid);
1822
1823static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1824                        const struct iovec *iov, size_t base, size_t bytes)
1825{
1826        size_t copied = 0, left = 0;
1827
1828        while (bytes) {
1829                char __user *buf = iov->iov_base + base;
1830                int copy = min(bytes, iov->iov_len - base);
1831
1832                base = 0;
1833                left = __copy_from_user_inatomic(vaddr, buf, copy);
1834                copied += copy;
1835                bytes -= copy;
1836                vaddr += copy;
1837                iov++;
1838
1839                if (unlikely(left))
1840                        break;
1841        }
1842        return copied - left;
1843}
1844
1845/*
1846 * Copy as much as we can into the page and return the number of bytes
1847 * successfully copied.  If a fault is encountered, return only the number of
1848 * bytes that were copied before the fault.
1849 */
1850size_t iov_iter_copy_from_user_atomic(struct page *page,
1851                struct iov_iter *i, unsigned long offset, size_t bytes)
1852{
1853        char *kaddr;
1854        size_t copied;
1855
1856        BUG_ON(!in_atomic());
1857        kaddr = kmap_atomic(page, KM_USER0);
1858        if (likely(i->nr_segs == 1)) {
1859                int left;
1860                char __user *buf = i->iov->iov_base + i->iov_offset;
1861                left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1862                copied = bytes - left;
1863        } else {
1864                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1865                                                i->iov, i->iov_offset, bytes);
1866        }
1867        kunmap_atomic(kaddr, KM_USER0);
1868
1869        return copied;
1870}
1871EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1872
1873/*
1874 * This has the same side effects and return value as
1875 * iov_iter_copy_from_user_atomic().
1876 * The difference is that it attempts to resolve faults.
1877 * Page must not be locked.
1878 */
1879size_t iov_iter_copy_from_user(struct page *page,
1880                struct iov_iter *i, unsigned long offset, size_t bytes)
1881{
1882        char *kaddr;
1883        size_t copied;
1884
1885        kaddr = kmap(page);
1886        if (likely(i->nr_segs == 1)) {
1887                int left;
1888                char __user *buf = i->iov->iov_base + i->iov_offset;
1889                left = __copy_from_user(kaddr + offset, buf, bytes);
1890                copied = bytes - left;
1891        } else {
1892                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1893                                                i->iov, i->iov_offset, bytes);
1894        }
1895        kunmap(page);
1896        return copied;
1897}
1898EXPORT_SYMBOL(iov_iter_copy_from_user);
1899
1900void iov_iter_advance(struct iov_iter *i, size_t bytes)
1901{
1902        BUG_ON(i->count < bytes);
1903
1904        if (likely(i->nr_segs == 1)) {
1905                i->iov_offset += bytes;
1906                i->count -= bytes;
1907        } else {
1908                const struct iovec *iov = i->iov;
1909                size_t base = i->iov_offset;
1910
1911                /*
1912                 * The !iov->iov_len check ensures we skip over unlikely
1913 * zero-length segments (without overrunning the iovec).
1914                 */
1915                while (bytes || unlikely(i->count && !iov->iov_len)) {
1916                        int copy;
1917
1918                        copy = min(bytes, iov->iov_len - base);
1919                        BUG_ON(!i->count || i->count < copy);
1920                        i->count -= copy;
1921                        bytes -= copy;
1922                        base += copy;
1923                        if (iov->iov_len == base) {
1924                                iov++;
1925                                base = 0;
1926                        }
1927                }
1928                i->iov = iov;
1929                i->iov_offset = base;
1930        }
1931}
1932EXPORT_SYMBOL(iov_iter_advance);
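/*
 * Illustrative sketch of the iov_iter helpers above on a two-segment iovec;
 * the buffers and lengths are made up for the example and the function is
 * kept out of the build.
 */
#if 0
static void example_iov_iter_walk(char __user *buf1, char __user *buf2)
{
        struct iovec iov[2] = {
                { .iov_base = buf1, .iov_len = 100 },
                { .iov_base = buf2, .iov_len = 100 },
        };
        struct iov_iter i;

        iov_iter_init(&i, iov, 2, 200, 0);

        /* consume all of iov[0] and the first 50 bytes of iov[1] */
        iov_iter_advance(&i, 150);

        /* i.iov now points at iov[1] with iov_offset == 50 */
        BUG_ON(iov_iter_count(&i) != 50);
        BUG_ON(iov_iter_single_seg_count(&i) != 50);
}
#endif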
1933
1934/*
1935 * Fault in the first iovec of the given iov_iter, to a maximum length
1936 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1937 * accessed (ie. because it is an invalid address).
1938 *
1939 * writev-intensive code may want this to prefault several iovecs -- that
1940 * would be possible (callers must not rely on the fact that _only_ the
1941 * first iovec will be faulted with the current implementation).
1942 */
1943int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1944{
1945        char __user *buf = i->iov->iov_base + i->iov_offset;
1946        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1947        return fault_in_pages_readable(buf, bytes);
1948}
1949EXPORT_SYMBOL(iov_iter_fault_in_readable);
1950
1951/*
1952 * Return the count of just the current iov_iter segment.
1953 */
1954size_t iov_iter_single_seg_count(struct iov_iter *i)
1955{
1956        const struct iovec *iov = i->iov;
1957        if (i->nr_segs == 1)
1958                return i->count;
1959        else
1960                return min(i->count, iov->iov_len - i->iov_offset);
1961}
1962EXPORT_SYMBOL(iov_iter_single_seg_count);
1963
1964/*
1965 * Performs necessary checks before doing a write
1966 *
1967 * Can adjust writing position or amount of bytes to write.
1968 * Returns appropriate error code that caller should return or
1969 * zero in case that write should be allowed.
1970 */
1971inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1972{
1973        struct inode *inode = file->f_mapping->host;
1974        unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1975
1976        if (unlikely(*pos < 0))
1977                return -EINVAL;
1978
1979        if (!isblk) {
1980                /* FIXME: this is for backwards compatibility with 2.4 */
1981                if (file->f_flags & O_APPEND)
1982                        *pos = i_size_read(inode);
1983
1984                if (limit != RLIM_INFINITY) {
1985                        if (*pos >= limit) {
1986                                send_sig(SIGXFSZ, current, 0);
1987                                return -EFBIG;
1988                        }
1989                        if (*count > limit - (typeof(limit))*pos) {
1990                                *count = limit - (typeof(limit))*pos;
1991                        }
1992                }
1993        }
1994
1995        /*
1996         * LFS rule
1997         */
1998        if (unlikely(*pos + *count > MAX_NON_LFS &&
1999                                !(file->f_flags & O_LARGEFILE))) {
2000                if (*pos >= MAX_NON_LFS) {
2001                        return -EFBIG;
2002                }
2003                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2004                        *count = MAX_NON_LFS - (unsigned long)*pos;
2005                }
2006        }
2007
2008        /*
2009         * Are we about to exceed the fs block limit?
2010         *
2011         * If we have already written some data it becomes a short write.  If we
2012         * have exceeded the limit without writing any data, return -EFBIG.
2013         * Linus's frestrict idea will clean these up nicely.
2014         */
2015        if (likely(!isblk)) {
2016                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2017                        if (*count || *pos > inode->i_sb->s_maxbytes) {
2018                                return -EFBIG;
2019                        }
2020                        /* zero-length writes at ->s_maxbytes are OK */
2021                }
2022
2023                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2024                        *count = inode->i_sb->s_maxbytes - *pos;
2025        } else {
2026#ifdef CONFIG_BLOCK
2027                loff_t isize;
2028                if (bdev_read_only(I_BDEV(inode)))
2029                        return -EPERM;
2030                isize = i_size_read(inode);
2031                if (*pos >= isize) {
2032                        if (*count || *pos > isize)
2033                                return -ENOSPC;
2034                }
2035
2036                if (*pos + *count > isize)
2037                        *count = isize - *pos;
2038#else
2039                return -EPERM;
2040#endif
2041        }
2042        return 0;
2043}
2044EXPORT_SYMBOL(generic_write_checks);
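/*
 * Illustrative sketch, not from the original source: the expected calling
 * pattern for generic_write_checks() in a write path.  examplefs_do_write()
 * is hypothetical.
 */
#if 0
static ssize_t examplefs_write(struct file *file, loff_t *ppos, size_t len)
{
        struct inode *inode = file->f_mapping->host;
        loff_t pos = *ppos;
        size_t count = len;
        int err;

        /* may move pos (O_APPEND) and/or trim count (rlimit, LFS, s_maxbytes) */
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                return err;
        if (count == 0)
                return 0;

        return examplefs_do_write(file, pos, count, ppos);
}
#endif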
2045
2046int pagecache_write_begin(struct file *file, struct address_space *mapping,
2047                                loff_t pos, unsigned len, unsigned flags,
2048                                struct page **pagep, void **fsdata)
2049{
2050        const struct address_space_operations *aops = mapping->a_ops;
2051
2052        return aops->write_begin(file, mapping, pos, len, flags,
2053                                                        pagep, fsdata);
2054}
2055EXPORT_SYMBOL(pagecache_write_begin);
2056
2057int pagecache_write_end(struct file *file, struct address_space *mapping,
2058                                loff_t pos, unsigned len, unsigned copied,
2059                                struct page *page, void *fsdata)
2060{
2061        const struct address_space_operations *aops = mapping->a_ops;
2062
2063        mark_page_accessed(page);
2064        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2065}
2066EXPORT_SYMBOL(pagecache_write_end);
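/*
 * Illustrative sketch of the write_begin/write_end contract for a
 * kernel-internal writer copying into a single page; not part of the
 * original source, and assumed to be called from non-atomic context.
 */
#if 0
static int example_write_kernel_buf(struct file *file, loff_t pos,
                                    const char *src, unsigned len)
{
        struct address_space *mapping = file->f_mapping;
        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
        struct page *page;
        void *fsdata;
        char *kaddr;
        int ret;

        BUG_ON(offset + len > PAGE_CACHE_SIZE);         /* one page only */

        ret = pagecache_write_begin(file, mapping, pos, len,
                                    AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
        if (ret)
                return ret;

        kaddr = kmap_atomic(page, KM_USER0);
        memcpy(kaddr + offset, src, len);
        kunmap_atomic(kaddr, KM_USER0);
        flush_dcache_page(page);

        ret = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
        return ret < 0 ? ret : 0;
}
#endif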
2067
2068ssize_t
2069generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2070                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2071                size_t count, size_t ocount)
2072{
2073        struct file     *file = iocb->ki_filp;
2074        struct address_space *mapping = file->f_mapping;
2075        struct inode    *inode = mapping->host;
2076        ssize_t         written;
2077        size_t          write_len;
2078        pgoff_t         end;
2079
2080        if (count != ocount)
2081                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2082
2083        write_len = iov_length(iov, *nr_segs);
2084        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2085
2086        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2087        if (written)
2088                goto out;
2089
2090        /*
2091         * After a write we want buffered reads to be sure to go to disk to get
2092         * the new data.  We invalidate clean cached pages from the region we're
2093         * about to write.  We do this *before* the write so that we can return
2094         * without clobbering -EIOCBQUEUED from ->direct_IO().
2095         */
2096        if (mapping->nrpages) {
2097                written = invalidate_inode_pages2_range(mapping,
2098                                        pos >> PAGE_CACHE_SHIFT, end);
2099                /*
2100                 * If a page cannot be invalidated, return 0 to fall back
2101                 * to buffered write.
2102                 */
2103                if (written) {
2104                        if (written == -EBUSY)
2105                                return 0;
2106                        goto out;
2107                }
2108        }
2109
2110        written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2111
2112        /*
2113         * Finally, try again to invalidate clean pages which might have been
2114         * cached by non-direct readahead, or faulted in by get_user_pages()
2115         * if the source of the write was an mmap'ed region of the file
2116         * we're writing.  Either one is a pretty crazy thing to do,
2117         * so we don't support it 100%.  If this invalidation
2118         * fails, tough, the write still worked...
2119         */
2120        if (mapping->nrpages) {
2121                invalidate_inode_pages2_range(mapping,
2122                                              pos >> PAGE_CACHE_SHIFT, end);
2123        }
2124
2125        if (written > 0) {
2126                loff_t end = pos + written;
2127                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2128                        i_size_write(inode, end);
2129                        mark_inode_dirty(inode);
2130                }
2131                *ppos = end;
2132        }
2133out:
2134        return written;
2135}
2136EXPORT_SYMBOL(generic_file_direct_write);
2137
2138/*
2139 * Find or create a page at the given pagecache position. Return the locked
2140 * page. This function is specifically for buffered writes.
2141 */
2142struct page *grab_cache_page_write_begin(struct address_space *mapping,
2143                                        pgoff_t index, unsigned flags)
2144{
2145        int status;
2146        struct page *page;
2147        gfp_t gfp_notmask = 0;
2148        if (flags & AOP_FLAG_NOFS)
2149                gfp_notmask = __GFP_FS;
2150repeat:
2151        page = find_lock_page(mapping, index);
2152        if (likely(page))
2153                return page;
2154
2155        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2156        if (!page)
2157                return NULL;
2158        status = add_to_page_cache_lru(page, mapping, index,
2159                                                GFP_KERNEL & ~gfp_notmask);
2160        if (unlikely(status)) {
2161                page_cache_release(page);
2162                if (status == -EEXIST)
2163                        goto repeat;
2164                return NULL;
2165        }
2166        return page;
2167}
2168EXPORT_SYMBOL(grab_cache_page_write_begin);
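/*
 * Illustrative sketch, in the spirit of simple_write_begin() in fs/libfs.c
 * (names hypothetical, not part of the original source): a minimal
 * ->write_begin built on grab_cache_page_write_begin(), zeroing the parts of
 * a fresh page that the copy will not overwrite.
 */
#if 0
static int examplefs_write_begin(struct file *file,
                                 struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
{
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        struct page *page;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;

        if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
                zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
        return 0;
}
#endif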
2169
2170static ssize_t generic_perform_write(struct file *file,
2171                                struct iov_iter *i, loff_t pos)
2172{
2173        struct address_space *mapping = file->f_mapping;
2174        const struct address_space_operations *a_ops = mapping->a_ops;
2175        long status = 0;
2176        ssize_t written = 0;
2177        unsigned int flags = 0;
2178
2179        /*
2180         * Copies from kernel address space cannot fail (NFSD is a big user).
2181         */
2182        if (segment_eq(get_fs(), KERNEL_DS))
2183                flags |= AOP_FLAG_UNINTERRUPTIBLE;
2184
2185        do {
2186                struct page *page;
2187                pgoff_t index;          /* Pagecache index for current page */
2188                unsigned long offset;   /* Offset into pagecache page */
2189                unsigned long bytes;    /* Bytes to write to page */
2190                size_t copied;          /* Bytes copied from user */
2191                void *fsdata;
2192
2193                offset = (pos & (PAGE_CACHE_SIZE - 1));
2194                index = pos >> PAGE_CACHE_SHIFT;
2195                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2196                                                iov_iter_count(i));
2197
2198again:
2199
2200                /*
2201                 * Bring in the user page that we will copy from _first_.
2202                 * Otherwise there's a nasty deadlock on copying from the
2203                 * same page as we're writing to, without it being marked
2204                 * up-to-date.
2205                 *
2206                 * Not only is this an optimisation, but it is also required
2207                 * to check that the address is actually valid, when atomic
2208                 * usercopies are used, below.
2209                 */
2210                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2211                        status = -EFAULT;
2212                        break;
2213                }
2214
2215                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2216                                                &page, &fsdata);
2217                if (unlikely(status))
2218                        break;
2219
2220                pagefault_disable();
2221                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2222                pagefault_enable();
2223                flush_dcache_page(page);
2224
2225                mark_page_accessed(page);
2226                status = a_ops->write_end(file, mapping, pos, bytes, copied,
2227                                                page, fsdata);
2228                if (unlikely(status < 0))
2229                        break;
2230                copied = status;
2231
2232                cond_resched();
2233
2234                iov_iter_advance(i, copied);
2235                if (unlikely(copied == 0)) {
2236                        /*
2237                         * If we were unable to copy any data at all, we must
2238                         * fall back to a single segment length write.
2239                         *
2240                         * If we didn't fallback here, we could livelock
2241                         * because not all segments in the iov can be copied at
2242                         * once without a pagefault.
2243                         */
2244                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2245                                                iov_iter_single_seg_count(i));
2246                        goto again;
2247                }
2248                pos += copied;
2249                written += copied;
2250
2251                balance_dirty_pages_ratelimited(mapping);
2252
2253        } while (iov_iter_count(i));
2254
2255        return written ? written : status;
2256}
2257
2258ssize_t
2259generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2260                unsigned long nr_segs, loff_t pos, loff_t *ppos,
2261                size_t count, ssize_t written)
2262{
2263        struct file *file = iocb->ki_filp;
2264        struct address_space *mapping = file->f_mapping;
2265        ssize_t status;
2266        struct iov_iter i;
2267
2268        iov_iter_init(&i, iov, nr_segs, count, written);
2269        status = generic_perform_write(file, &i, pos);
2270
2271        if (likely(status >= 0)) {
2272                written += status;
2273                *ppos = pos + status;
2274        }
2275
2276        /*
2277         * If we get here for O_DIRECT writes then we must have fallen through
2278         * to buffered writes (block instantiation inside i_size).  So we sync
2279         * the file data here, to try to honour O_DIRECT expectations.
2280         */
2281        if (unlikely(file->f_flags & O_DIRECT) && written)
2282                status = filemap_write_and_wait_range(mapping,
2283                                        pos, pos + written - 1);
2284
2285        return written ? written : status;
2286}
2287EXPORT_SYMBOL(generic_file_buffered_write);
2288
2289/**
2290 * __generic_file_aio_write - write data to a file
2291 * @iocb:       IO state structure (file, offset, etc.)
2292 * @iov:        vector with data to write
2293 * @nr_segs:    number of segments in the vector
2294 * @ppos:       position where to write
2295 *
2296 * This function does all the work needed for actually writing data to a
2297 * file. It does all basic checks, removes SUID from the file, updates
2298 * modification times and calls proper subroutines depending on whether we
2299 * do direct IO or a standard buffered write.
2300 *
2301 * It expects i_mutex to be grabbed unless we work on a block device or similar
2302 * object which does not need locking at all.
2303 *
2304 * This function does *not* take care of syncing data in case of O_SYNC write.
2305 * A caller has to handle it. This is mainly due to the fact that we want to
2306 * avoid syncing under i_mutex.
2307 */
2308ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2309                                 unsigned long nr_segs, loff_t *ppos)
2310{
2311        struct file *file = iocb->ki_filp;
2312        struct address_space *mapping = file->f_mapping;
2313        size_t ocount;          /* original count */
2314        size_t count;           /* after file limit checks */
2315        struct inode    *inode = mapping->host;
2316        loff_t          pos;
2317        ssize_t         written;
2318        ssize_t         err;
2319
2320        ocount = 0;
2321        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2322        if (err)
2323                return err;
2324
2325        count = ocount;
2326        pos = *ppos;
2327
2328        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2329
2330        /* We can write back this queue in page reclaim */
2331        current->backing_dev_info = mapping->backing_dev_info;
2332        written = 0;
2333
2334        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2335        if (err)
2336                goto out;
2337
2338        if (count == 0)
2339                goto out;
2340
2341        err = file_remove_suid(file);
2342        if (err)
2343                goto out;
2344
2345        file_update_time(file);
2346
2347        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2348        if (unlikely(file->f_flags & O_DIRECT)) {
2349                loff_t endbyte;
2350                ssize_t written_buffered;
2351
2352                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2353                                                        ppos, count, ocount);
2354                if (written < 0 || written == count)
2355                        goto out;
2356                /*
2357                 * direct-io write to a hole: fall through to buffered I/O
2358                 * for completing the rest of the request.
2359                 */
2360                pos += written;
2361                count -= written;
2362                written_buffered = generic_file_buffered_write(iocb, iov,
2363                                                nr_segs, pos, ppos, count,
2364                                                written);
2365                /*
2366                 * If generic_file_buffered_write() returned a synchronous error
2367                 * then we want to return the number of bytes which were
2368                 * direct-written, or the error code if that was zero.  Note
2369                 * that this differs from normal direct-io semantics, which
2370                 * will return -EFOO even if some bytes were written.
2371                 */
2372                if (written_buffered < 0) {
2373                        err = written_buffered;
2374                        goto out;
2375                }
2376
2377                /*
2378                 * We need to ensure that the page cache pages are written to
2379                 * disk and invalidated to preserve the expected O_DIRECT
2380                 * semantics.
2381                 */
2382                endbyte = pos + written_buffered - written - 1;
2383                err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2384                                            SYNC_FILE_RANGE_WAIT_BEFORE|
2385                                            SYNC_FILE_RANGE_WRITE|
2386                                            SYNC_FILE_RANGE_WAIT_AFTER);
2387                if (err == 0) {
2388                        written = written_buffered;
2389                        invalidate_mapping_pages(mapping,
2390                                                 pos >> PAGE_CACHE_SHIFT,
2391                                                 endbyte >> PAGE_CACHE_SHIFT);
2392                } else {
2393                        /*
2394                         * We don't know how much we wrote, so just return
2395                         * the number of bytes which were direct-written
2396                         */
2397                }
2398        } else {
2399                written = generic_file_buffered_write(iocb, iov, nr_segs,
2400                                pos, ppos, count, written);
2401        }
2402out:
2403        current->backing_dev_info = NULL;
2404        return written ? written : err;
2405}
2406EXPORT_SYMBOL(__generic_file_aio_write);
2407
2408/**
2409 * generic_file_aio_write - write data to a file
2410 * @iocb:       IO state structure
2411 * @iov:        vector with data to write
2412 * @nr_segs:    number of segments in the vector
2413 * @pos:        position in file where to write
2414 *
2415 * This is a wrapper around __generic_file_aio_write() to be used by most
2416 * filesystems. It takes care of syncing the file in case of O_SYNC file
2417 * and acquires i_mutex as needed.
2418 */
2419ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2420                unsigned long nr_segs, loff_t pos)
2421{
2422        struct file *file = iocb->ki_filp;
2423        struct inode *inode = file->f_mapping->host;
2424        ssize_t ret;
2425
2426        BUG_ON(iocb->ki_pos != pos);
2427
2428        mutex_lock(&inode->i_mutex);
2429        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2430        mutex_unlock(&inode->i_mutex);
2431
2432        if (ret > 0 || ret == -EIOCBQUEUED) {
2433                ssize_t err;
2434
2435                err = generic_write_sync(file, pos, ret);
2436                if (err < 0 && ret > 0)
2437                        ret = err;
2438        }
2439        return ret;
2440}
2441EXPORT_SYMBOL(generic_file_aio_write);
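/*
 * Illustrative sketch, not part of the original source: a filesystem that
 * needs a pre-flight check of its own can wrap generic_file_aio_write() and
 * point .aio_write at the wrapper.  examplefs_forced_readonly() is
 * hypothetical.
 */
#if 0
static ssize_t examplefs_file_aio_write(struct kiocb *iocb,
                                        const struct iovec *iov,
                                        unsigned long nr_segs, loff_t pos)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;

        if (examplefs_forced_readonly(inode->i_sb))
                return -EROFS;

        /* i_mutex and O_SYNC handling are done by the generic helper */
        return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
#endif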
2442
2443/**
2444 * try_to_release_page() - release old fs-specific metadata on a page
2445 *
2446 * @page: the page which the kernel is trying to free
2447 * @gfp_mask: memory allocation flags (and I/O mode)
2448 *
2449 * The address_space is asked to release any data it holds against the page
2450 * (presumably at page->private).  If the release was successful, return `1'.
2451 * Otherwise return zero.
2452 *
2453 * This may also be called if PG_fscache is set on a page, indicating that the
2454 * page is known to the local caching routines.
2455 *
2456 * The @gfp_mask argument specifies whether I/O may be performed to release
2457 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT and __GFP_FS).
2458 *
2459 */
2460int try_to_release_page(struct page *page, gfp_t gfp_mask)
2461{
2462        struct address_space * const mapping = page->mapping;
2463
2464        BUG_ON(!PageLocked(page));
2465        if (PageWriteback(page))
2466                return 0;
2467
2468        if (mapping && mapping->a_ops->releasepage)
2469                return mapping->a_ops->releasepage(page, gfp_mask);
2470        return try_to_free_buffers(page);
2471}
2472
2473EXPORT_SYMBOL(try_to_release_page);
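/*
 * Illustrative sketch of the calling pattern reclaim-style code follows
 * around try_to_release_page(); not part of the original source.  The page
 * must be locked, mirroring the BUG_ON above.
 */
#if 0
static int example_try_strip_page(struct page *page)
{
        BUG_ON(!PageLocked(page));

        /* nothing attached (no buffers, no fscache mark): nothing to do */
        if (!page_has_private(page))
                return 1;

        /* GFP_KERNEL: allow the release path to block and do I/O */
        return try_to_release_page(page, GFP_KERNEL);
}
#endif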
2474