linux/mm/shmem.c
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *               2000 Transmeta Corp.
   6 *               2000-2001 Christoph Rohland
   7 *               2000-2001 SAP AG
   8 *               2002 Red Hat Inc.
   9 * Copyright (C) 2002-2005 Hugh Dickins.
  10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  12 *
  13 * Extended attribute support for tmpfs:
  14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  16 *
  17 * This file is released under the GPL.
  18 */
  19
  20/*
  21 * This virtual memory filesystem is heavily based on the ramfs. It
   22 * extends ramfs by the ability to use swap and honor resource limits,
   23 * which makes it a completely usable filesystem.
  24 */
  25
  26#include <linux/module.h>
  27#include <linux/init.h>
  28#include <linux/fs.h>
  29#include <linux/xattr.h>
  30#include <linux/exportfs.h>
  31#include <linux/generic_acl.h>
  32#include <linux/mm.h>
  33#include <linux/mman.h>
  34#include <linux/file.h>
  35#include <linux/swap.h>
  36#include <linux/pagemap.h>
  37#include <linux/string.h>
  38#include <linux/slab.h>
  39#include <linux/backing-dev.h>
  40#include <linux/shmem_fs.h>
  41#include <linux/mount.h>
  42#include <linux/writeback.h>
  43#include <linux/vfs.h>
  44#include <linux/blkdev.h>
  45#include <linux/security.h>
  46#include <linux/swapops.h>
  47#include <linux/mempolicy.h>
  48#include <linux/namei.h>
  49#include <linux/ctype.h>
  50#include <linux/migrate.h>
  51#include <linux/highmem.h>
  52
  53#include <asm/uaccess.h>
  54#include <asm/div64.h>
  55#include <asm/pgtable.h>
  56
  57/* This magic number is used in glibc for posix shared memory */
  58#define TMPFS_MAGIC     0x01021994
  59
  60#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
  61#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
  62#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
  63
  64#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
  65#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
  66
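/*
 * Illustrative aside (not part of the original file): on a 32-bit build
 * with 4K pages (PAGE_CACHE_SIZE == 4096, sizeof(unsigned long) == 4)
 * and SHMEM_NR_DIRECT == 16, the limits above work out as:
 *
 *	ENTRIES_PER_PAGE     = 4096 / 4            = 1024
 *	ENTRIES_PER_PAGEPAGE = 1024 * 1024         = 1048576
 *	SHMEM_MAX_INDEX      = 16 + 524288 * 1025  = 537395216 pages
 *	SHMEM_MAX_BYTES      = 537395216 << 12     = roughly 2 TiB per file
 *	BLOCKS_PER_PAGE      = 4096 / 512          = 8 (512-byte blocks)
 */
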
  67#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
  68
  69/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
  70#define SHMEM_PAGEIN     VM_READ
  71#define SHMEM_TRUNCATE   VM_WRITE
  72
  73/* Definition to limit shmem_truncate's steps between cond_rescheds */
  74#define LATENCY_LIMIT    64
  75
  76/* Pretend that each entry is of this size in directory's i_size */
  77#define BOGO_DIRENT_SIZE 20
  78
  79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
  80enum sgp_type {
  81        SGP_QUICK,      /* don't try more than file page cache lookup */
  82        SGP_READ,       /* don't exceed i_size, don't allocate page */
  83        SGP_CACHE,      /* don't exceed i_size, may allocate page */
  84        SGP_WRITE,      /* may exceed i_size, may allocate page */
  85        SGP_FAULT,      /* same as SGP_CACHE, return with page locked */
  86};
  87
  88static int shmem_getpage(struct inode *inode, unsigned long idx,
  89                         struct page **pagep, enum sgp_type sgp, int *type);
  90
  91static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
  92{
  93        /*
  94         * The above definition of ENTRIES_PER_PAGE, and the use of
  95         * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
  96         * might be reconsidered if it ever diverges from PAGE_SIZE.
  97         *
  98         * Mobility flags are masked out as swap vectors cannot move
  99         */
 100        return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
 101                                PAGE_CACHE_SHIFT-PAGE_SHIFT);
 102}
 103
 104static inline void shmem_dir_free(struct page *page)
 105{
 106        __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
 107}
 108
 109static struct page **shmem_dir_map(struct page *page)
 110{
 111        return (struct page **)kmap_atomic(page, KM_USER0);
 112}
 113
 114static inline void shmem_dir_unmap(struct page **dir)
 115{
 116        kunmap_atomic(dir, KM_USER0);
 117}
 118
 119static swp_entry_t *shmem_swp_map(struct page *page)
 120{
 121        return (swp_entry_t *)kmap_atomic(page, KM_USER1);
 122}
 123
 124static inline void shmem_swp_balance_unmap(void)
 125{
 126        /*
 127         * When passing a pointer to an i_direct entry, to code which
 128         * also handles indirect entries and so will shmem_swp_unmap,
 129         * we must arrange for the preempt count to remain in balance.
 130         * What kmap_atomic of a lowmem page does depends on config
 131         * and architecture, so pretend to kmap_atomic some lowmem page.
 132         */
 133        (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
 134}
 135
 136static inline void shmem_swp_unmap(swp_entry_t *entry)
 137{
 138        kunmap_atomic(entry, KM_USER1);
 139}
 140
 141static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 142{
 143        return sb->s_fs_info;
 144}
 145
 146/*
 147 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 148 * for shared memory and for shared anonymous (/dev/zero) mappings
 149 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 150 * consistent with the pre-accounting of private mappings ...
 151 */
 152static inline int shmem_acct_size(unsigned long flags, loff_t size)
 153{
 154        return (flags & VM_ACCOUNT)?
 155                security_vm_enough_memory(VM_ACCT(size)): 0;
 156}
 157
 158static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 159{
 160        if (flags & VM_ACCOUNT)
 161                vm_unacct_memory(VM_ACCT(size));
 162}
 163
 164/*
 165 * ... whereas tmpfs objects are accounted incrementally as
 166 * pages are allocated, in order to allow huge sparse files.
 167 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 168 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 169 */
 170static inline int shmem_acct_block(unsigned long flags)
 171{
 172        return (flags & VM_ACCOUNT)?
 173                0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
 174}
 175
 176static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 177{
 178        if (!(flags & VM_ACCOUNT))
 179                vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
 180}
 181
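/*
 * Illustrative aside (not part of the original file), assuming 4K pages:
 * VM_ACCT(5000) == 2, i.e. sizes are rounded up to whole pages before
 * being charged.  A shared memory or /dev/zero object with VM_ACCOUNT
 * set is charged its full size once by shmem_acct_size(); a tmpfs file
 * is charged one page at a time by shmem_acct_block() only as pages are
 * actually instantiated, which is what permits huge sparse files.
 */
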
 182static const struct super_operations shmem_ops;
 183static const struct address_space_operations shmem_aops;
 184static const struct file_operations shmem_file_operations;
 185static const struct inode_operations shmem_inode_operations;
 186static const struct inode_operations shmem_dir_inode_operations;
 187static const struct inode_operations shmem_special_inode_operations;
 188static struct vm_operations_struct shmem_vm_ops;
 189
 190static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 191        .ra_pages       = 0,    /* No readahead */
 192        .capabilities   = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 193        .unplug_io_fn   = default_unplug_io_fn,
 194};
 195
 196static LIST_HEAD(shmem_swaplist);
 197static DEFINE_SPINLOCK(shmem_swaplist_lock);
 198
 199static void shmem_free_blocks(struct inode *inode, long pages)
 200{
 201        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 202        if (sbinfo->max_blocks) {
 203                spin_lock(&sbinfo->stat_lock);
 204                sbinfo->free_blocks += pages;
 205                inode->i_blocks -= pages*BLOCKS_PER_PAGE;
 206                spin_unlock(&sbinfo->stat_lock);
 207        }
 208}
 209
 210/*
 211 * shmem_recalc_inode - recalculate the size of an inode
 212 *
 213 * @inode: inode to recalc
 214 *
 215 * We have to calculate the free blocks since the mm can drop
 216 * undirtied hole pages behind our back.
 217 *
 218 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 219 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 220 *
 221 * It has to be called with the spinlock held.
 222 */
 223static void shmem_recalc_inode(struct inode *inode)
 224{
 225        struct shmem_inode_info *info = SHMEM_I(inode);
 226        long freed;
 227
 228        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 229        if (freed > 0) {
 230                info->alloced -= freed;
 231                shmem_unacct_blocks(info->flags, freed);
 232                shmem_free_blocks(inode, freed);
 233        }
 234}
 235
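/*
 * Illustrative aside (not part of the original file): if info->alloced
 * is 10 while nrpages is 4 and info->swapped is 3, then freed == 3:
 * three undirtied pages were dropped behind tmpfs's back, so three
 * blocks are un-accounted and returned to the superblock's free count.
 */
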
 236/*
 237 * shmem_swp_entry - find the swap vector position in the info structure
 238 *
 239 * @info:  info structure for the inode
 240 * @index: index of the page to find
 241 * @page:  optional page to add to the structure. Has to be preset to
 242 *         all zeros
 243 *
 244 * If there is no space allocated yet it will return NULL when
 245 * page is NULL, else it will use the page for the needed block,
 246 * setting it to NULL on return to indicate that it has been used.
 247 *
 248 * The swap vector is organized the following way:
 249 *
 250 * There are SHMEM_NR_DIRECT entries directly stored in the
  252 * shmem_inode_info structure. So small files do not need an additional
 252 * allocation.
 253 *
 254 * For pages with index > SHMEM_NR_DIRECT there is the pointer
 255 * i_indirect which points to a page which holds in the first half
 256 * doubly indirect blocks, in the second half triple indirect blocks:
 257 *
 258 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 259 * following layout (for SHMEM_NR_DIRECT == 16):
 260 *
 261 * i_indirect -> dir --> 16-19
 262 *            |      +-> 20-23
 263 *            |
 264 *            +-->dir2 --> 24-27
 265 *            |        +-> 28-31
 266 *            |        +-> 32-35
 267 *            |        +-> 36-39
 268 *            |
 269 *            +-->dir3 --> 40-43
 270 *                     +-> 44-47
 271 *                     +-> 48-51
 272 *                     +-> 52-55
 273 */
 274static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
 275{
 276        unsigned long offset;
 277        struct page **dir;
 278        struct page *subdir;
 279
 280        if (index < SHMEM_NR_DIRECT) {
 281                shmem_swp_balance_unmap();
 282                return info->i_direct+index;
 283        }
 284        if (!info->i_indirect) {
 285                if (page) {
 286                        info->i_indirect = *page;
 287                        *page = NULL;
 288                }
 289                return NULL;                    /* need another page */
 290        }
 291
 292        index -= SHMEM_NR_DIRECT;
 293        offset = index % ENTRIES_PER_PAGE;
 294        index /= ENTRIES_PER_PAGE;
 295        dir = shmem_dir_map(info->i_indirect);
 296
 297        if (index >= ENTRIES_PER_PAGE/2) {
 298                index -= ENTRIES_PER_PAGE/2;
 299                dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
 300                index %= ENTRIES_PER_PAGE;
 301                subdir = *dir;
 302                if (!subdir) {
 303                        if (page) {
 304                                *dir = *page;
 305                                *page = NULL;
 306                        }
 307                        shmem_dir_unmap(dir);
 308                        return NULL;            /* need another page */
 309                }
 310                shmem_dir_unmap(dir);
 311                dir = shmem_dir_map(subdir);
 312        }
 313
 314        dir += index;
 315        subdir = *dir;
 316        if (!subdir) {
 317                if (!page || !(subdir = *page)) {
 318                        shmem_dir_unmap(dir);
 319                        return NULL;            /* need a page */
 320                }
 321                *dir = subdir;
 322                *page = NULL;
 323        }
 324        shmem_dir_unmap(dir);
 325        return shmem_swp_map(subdir) + offset;
 326}
 327
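/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * helper mirroring the index arithmetic of shmem_swp_entry above,
 * without touching any pages.  For the artificial ENTRIES_PER_PAGE == 4
 * layout drawn in the comment, index 42 maps to top slot 3 (dir3),
 * middle slot 0 and leaf offset 2, i.e. the "40-43" leaf page.
 */
static void shmem_swp_index_example(unsigned long index,
	unsigned long *top, unsigned long *mid, unsigned long *leaf)
{
	unsigned long offset;

	BUG_ON(index < SHMEM_NR_DIRECT);  /* direct entries need no lookup */
	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;  /* slot within the leaf page */
	index /= ENTRIES_PER_PAGE;          /* which leaf page */

	if (index < ENTRIES_PER_PAGE/2) {
		*top = index;   /* first half: leaf hangs straight off topdir */
		*mid = 0;       /* no middle level in this case */
	} else {
		index -= ENTRIES_PER_PAGE/2;
		*top = ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		*mid = index % ENTRIES_PER_PAGE;
	}
	*leaf = offset;
}
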
 328static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
 329{
 330        long incdec = value? 1: -1;
 331
 332        entry->val = value;
 333        info->swapped += incdec;
 334        if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
 335                struct page *page = kmap_atomic_to_page(entry);
 336                set_page_private(page, page_private(page) + incdec);
 337        }
 338}
 339
 340/*
 341 * shmem_swp_alloc - get the position of the swap entry for the page.
  342 *                   If it does not exist, allocate the entry.
 343 *
 344 * @info:       info structure for the inode
 345 * @index:      index of the page to find
 346 * @sgp:        check and recheck i_size? skip allocation?
 347 */
 348static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
 349{
 350        struct inode *inode = &info->vfs_inode;
 351        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 352        struct page *page = NULL;
 353        swp_entry_t *entry;
 354
 355        if (sgp != SGP_WRITE &&
 356            ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 357                return ERR_PTR(-EINVAL);
 358
 359        while (!(entry = shmem_swp_entry(info, index, &page))) {
 360                if (sgp == SGP_READ)
 361                        return shmem_swp_map(ZERO_PAGE(0));
 362                /*
 363                 * Test free_blocks against 1 not 0, since we have 1 data
 364                 * page (and perhaps indirect index pages) yet to allocate:
 365                 * a waste to allocate index if we cannot allocate data.
 366                 */
 367                if (sbinfo->max_blocks) {
 368                        spin_lock(&sbinfo->stat_lock);
 369                        if (sbinfo->free_blocks <= 1) {
 370                                spin_unlock(&sbinfo->stat_lock);
 371                                return ERR_PTR(-ENOSPC);
 372                        }
 373                        sbinfo->free_blocks--;
 374                        inode->i_blocks += BLOCKS_PER_PAGE;
 375                        spin_unlock(&sbinfo->stat_lock);
 376                }
 377
 378                spin_unlock(&info->lock);
 379                page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
 380                if (page)
 381                        set_page_private(page, 0);
 382                spin_lock(&info->lock);
 383
 384                if (!page) {
 385                        shmem_free_blocks(inode, 1);
 386                        return ERR_PTR(-ENOMEM);
 387                }
 388                if (sgp != SGP_WRITE &&
 389                    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 390                        entry = ERR_PTR(-EINVAL);
 391                        break;
 392                }
 393                if (info->next_index <= index)
 394                        info->next_index = index + 1;
 395        }
 396        if (page) {
 397                /* another task gave its page, or truncated the file */
 398                shmem_free_blocks(inode, 1);
 399                shmem_dir_free(page);
 400        }
 401        if (info->next_index <= index && !IS_ERR(entry))
 402                info->next_index = index + 1;
 403        return entry;
 404}
 405
 406/*
 407 * shmem_free_swp - free some swap entries in a directory
 408 *
 409 * @dir:        pointer to the directory
 410 * @edir:       pointer after last entry of the directory
 411 * @punch_lock: pointer to spinlock when needed for the holepunch case
 412 */
 413static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
 414                                                spinlock_t *punch_lock)
 415{
 416        spinlock_t *punch_unlock = NULL;
 417        swp_entry_t *ptr;
 418        int freed = 0;
 419
 420        for (ptr = dir; ptr < edir; ptr++) {
 421                if (ptr->val) {
 422                        if (unlikely(punch_lock)) {
 423                                punch_unlock = punch_lock;
 424                                punch_lock = NULL;
 425                                spin_lock(punch_unlock);
 426                                if (!ptr->val)
 427                                        continue;
 428                        }
 429                        free_swap_and_cache(*ptr);
 430                        *ptr = (swp_entry_t){0};
 431                        freed++;
 432                }
 433        }
 434        if (punch_unlock)
 435                spin_unlock(punch_unlock);
 436        return freed;
 437}
 438
 439static int shmem_map_and_free_swp(struct page *subdir, int offset,
 440                int limit, struct page ***dir, spinlock_t *punch_lock)
 441{
 442        swp_entry_t *ptr;
 443        int freed = 0;
 444
 445        ptr = shmem_swp_map(subdir);
 446        for (; offset < limit; offset += LATENCY_LIMIT) {
 447                int size = limit - offset;
 448                if (size > LATENCY_LIMIT)
 449                        size = LATENCY_LIMIT;
 450                freed += shmem_free_swp(ptr+offset, ptr+offset+size,
 451                                                        punch_lock);
 452                if (need_resched()) {
 453                        shmem_swp_unmap(ptr);
 454                        if (*dir) {
 455                                shmem_dir_unmap(*dir);
 456                                *dir = NULL;
 457                        }
 458                        cond_resched();
 459                        ptr = shmem_swp_map(subdir);
 460                }
 461        }
 462        shmem_swp_unmap(ptr);
 463        return freed;
 464}
 465
 466static void shmem_free_pages(struct list_head *next)
 467{
 468        struct page *page;
 469        int freed = 0;
 470
 471        do {
 472                page = container_of(next, struct page, lru);
 473                next = next->next;
 474                shmem_dir_free(page);
 475                freed++;
 476                if (freed >= LATENCY_LIMIT) {
 477                        cond_resched();
 478                        freed = 0;
 479                }
 480        } while (next);
 481}
 482
 483static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 484{
 485        struct shmem_inode_info *info = SHMEM_I(inode);
 486        unsigned long idx;
 487        unsigned long size;
 488        unsigned long limit;
 489        unsigned long stage;
 490        unsigned long diroff;
 491        struct page **dir;
 492        struct page *topdir;
 493        struct page *middir;
 494        struct page *subdir;
 495        swp_entry_t *ptr;
 496        LIST_HEAD(pages_to_free);
 497        long nr_pages_to_free = 0;
 498        long nr_swaps_freed = 0;
 499        int offset;
 500        int freed;
 501        int punch_hole;
 502        spinlock_t *needs_lock;
 503        spinlock_t *punch_lock;
 504        unsigned long upper_limit;
 505
 506        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 507        idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 508        if (idx >= info->next_index)
 509                return;
 510
 511        spin_lock(&info->lock);
 512        info->flags |= SHMEM_TRUNCATE;
 513        if (likely(end == (loff_t) -1)) {
 514                limit = info->next_index;
 515                upper_limit = SHMEM_MAX_INDEX;
 516                info->next_index = idx;
 517                needs_lock = NULL;
 518                punch_hole = 0;
 519        } else {
 520                if (end + 1 >= inode->i_size) { /* we may free a little more */
 521                        limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
 522                                                        PAGE_CACHE_SHIFT;
 523                        upper_limit = SHMEM_MAX_INDEX;
 524                } else {
 525                        limit = (end + 1) >> PAGE_CACHE_SHIFT;
 526                        upper_limit = limit;
 527                }
 528                needs_lock = &info->lock;
 529                punch_hole = 1;
 530        }
 531
 532        topdir = info->i_indirect;
 533        if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 534                info->i_indirect = NULL;
 535                nr_pages_to_free++;
 536                list_add(&topdir->lru, &pages_to_free);
 537        }
 538        spin_unlock(&info->lock);
 539
 540        if (info->swapped && idx < SHMEM_NR_DIRECT) {
 541                ptr = info->i_direct;
 542                size = limit;
 543                if (size > SHMEM_NR_DIRECT)
 544                        size = SHMEM_NR_DIRECT;
 545                nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
 546        }
 547
 548        /*
 549         * If there are no indirect blocks or we are punching a hole
 550         * below indirect blocks, nothing to be done.
 551         */
 552        if (!topdir || limit <= SHMEM_NR_DIRECT)
 553                goto done2;
 554
 555        /*
 556         * The truncation case has already dropped info->lock, and we're safe
 557         * because i_size and next_index have already been lowered, preventing
 558         * access beyond.  But in the punch_hole case, we still need to take
 559         * the lock when updating the swap directory, because there might be
 560         * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
 561         * shmem_writepage.  However, whenever we find we can remove a whole
 562         * directory page (not at the misaligned start or end of the range),
 563         * we first NULLify its pointer in the level above, and then have no
 564         * need to take the lock when updating its contents: needs_lock and
 565         * punch_lock (either pointing to info->lock or NULL) manage this.
 566         */
 567
 568        upper_limit -= SHMEM_NR_DIRECT;
 569        limit -= SHMEM_NR_DIRECT;
 570        idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
 571        offset = idx % ENTRIES_PER_PAGE;
 572        idx -= offset;
 573
 574        dir = shmem_dir_map(topdir);
 575        stage = ENTRIES_PER_PAGEPAGE/2;
 576        if (idx < ENTRIES_PER_PAGEPAGE/2) {
 577                middir = topdir;
 578                diroff = idx/ENTRIES_PER_PAGE;
 579        } else {
 580                dir += ENTRIES_PER_PAGE/2;
 581                dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
 582                while (stage <= idx)
 583                        stage += ENTRIES_PER_PAGEPAGE;
 584                middir = *dir;
 585                if (*dir) {
 586                        diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
 587                                ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
 588                        if (!diroff && !offset && upper_limit >= stage) {
 589                                if (needs_lock) {
 590                                        spin_lock(needs_lock);
 591                                        *dir = NULL;
 592                                        spin_unlock(needs_lock);
 593                                        needs_lock = NULL;
 594                                } else
 595                                        *dir = NULL;
 596                                nr_pages_to_free++;
 597                                list_add(&middir->lru, &pages_to_free);
 598                        }
 599                        shmem_dir_unmap(dir);
 600                        dir = shmem_dir_map(middir);
 601                } else {
 602                        diroff = 0;
 603                        offset = 0;
 604                        idx = stage;
 605                }
 606        }
 607
 608        for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
 609                if (unlikely(idx == stage)) {
 610                        shmem_dir_unmap(dir);
 611                        dir = shmem_dir_map(topdir) +
 612                            ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 613                        while (!*dir) {
 614                                dir++;
 615                                idx += ENTRIES_PER_PAGEPAGE;
 616                                if (idx >= limit)
 617                                        goto done1;
 618                        }
 619                        stage = idx + ENTRIES_PER_PAGEPAGE;
 620                        middir = *dir;
 621                        if (punch_hole)
 622                                needs_lock = &info->lock;
 623                        if (upper_limit >= stage) {
 624                                if (needs_lock) {
 625                                        spin_lock(needs_lock);
 626                                        *dir = NULL;
 627                                        spin_unlock(needs_lock);
 628                                        needs_lock = NULL;
 629                                } else
 630                                        *dir = NULL;
 631                                nr_pages_to_free++;
 632                                list_add(&middir->lru, &pages_to_free);
 633                        }
 634                        shmem_dir_unmap(dir);
 635                        cond_resched();
 636                        dir = shmem_dir_map(middir);
 637                        diroff = 0;
 638                }
 639                punch_lock = needs_lock;
 640                subdir = dir[diroff];
 641                if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
 642                        if (needs_lock) {
 643                                spin_lock(needs_lock);
 644                                dir[diroff] = NULL;
 645                                spin_unlock(needs_lock);
 646                                punch_lock = NULL;
 647                        } else
 648                                dir[diroff] = NULL;
 649                        nr_pages_to_free++;
 650                        list_add(&subdir->lru, &pages_to_free);
 651                }
 652                if (subdir && page_private(subdir) /* has swap entries */) {
 653                        size = limit - idx;
 654                        if (size > ENTRIES_PER_PAGE)
 655                                size = ENTRIES_PER_PAGE;
 656                        freed = shmem_map_and_free_swp(subdir,
 657                                        offset, size, &dir, punch_lock);
 658                        if (!dir)
 659                                dir = shmem_dir_map(middir);
 660                        nr_swaps_freed += freed;
 661                        if (offset || punch_lock) {
 662                                spin_lock(&info->lock);
 663                                set_page_private(subdir,
 664                                        page_private(subdir) - freed);
 665                                spin_unlock(&info->lock);
 666                        } else
 667                                BUG_ON(page_private(subdir) != freed);
 668                }
 669                offset = 0;
 670        }
 671done1:
 672        shmem_dir_unmap(dir);
 673done2:
 674        if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
 675                /*
 676                 * Call truncate_inode_pages again: racing shmem_unuse_inode
 677                 * may have swizzled a page in from swap since vmtruncate or
 678                 * generic_delete_inode did it, before we lowered next_index.
 679                 * Also, though shmem_getpage checks i_size before adding to
 680                 * cache, no recheck after: so fix the narrow window there too.
 681                 *
 682                 * Recalling truncate_inode_pages_range and unmap_mapping_range
 683                 * every time for punch_hole (which never got a chance to clear
 684                 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
 685                 * yet hardly ever necessary: try to optimize them out later.
 686                 */
 687                truncate_inode_pages_range(inode->i_mapping, start, end);
 688                if (punch_hole)
 689                        unmap_mapping_range(inode->i_mapping, start,
 690                                                        end - start, 1);
 691        }
 692
 693        spin_lock(&info->lock);
 694        info->flags &= ~SHMEM_TRUNCATE;
 695        info->swapped -= nr_swaps_freed;
 696        if (nr_pages_to_free)
 697                shmem_free_blocks(inode, nr_pages_to_free);
 698        shmem_recalc_inode(inode);
 699        spin_unlock(&info->lock);
 700
 701        /*
 702         * Empty swap vector directory pages to be freed?
 703         */
 704        if (!list_empty(&pages_to_free)) {
 705                pages_to_free.prev->next = NULL;
 706                shmem_free_pages(pages_to_free.next);
 707        }
 708}
 709
 710static void shmem_truncate(struct inode *inode)
 711{
 712        shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
 713}
 714
 715static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 716{
 717        struct inode *inode = dentry->d_inode;
 718        struct page *page = NULL;
 719        int error;
 720
 721        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 722                if (attr->ia_size < inode->i_size) {
 723                        /*
 724                         * If truncating down to a partial page, then
 725                         * if that page is already allocated, hold it
 726                         * in memory until the truncation is over, so
  727                         * truncate_partial_page cannot miss it were
 728                         * it assigned to swap.
 729                         */
 730                        if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
 731                                (void) shmem_getpage(inode,
 732                                        attr->ia_size>>PAGE_CACHE_SHIFT,
 733                                                &page, SGP_READ, NULL);
 734                        }
 735                        /*
 736                         * Reset SHMEM_PAGEIN flag so that shmem_truncate can
 737                         * detect if any pages might have been added to cache
 738                         * after truncate_inode_pages.  But we needn't bother
 739                         * if it's being fully truncated to zero-length: the
 740                         * nrpages check is efficient enough in that case.
 741                         */
 742                        if (attr->ia_size) {
 743                                struct shmem_inode_info *info = SHMEM_I(inode);
 744                                spin_lock(&info->lock);
 745                                info->flags &= ~SHMEM_PAGEIN;
 746                                spin_unlock(&info->lock);
 747                        }
 748                }
 749        }
 750
 751        error = inode_change_ok(inode, attr);
 752        if (!error)
 753                error = inode_setattr(inode, attr);
 754#ifdef CONFIG_TMPFS_POSIX_ACL
 755        if (!error && (attr->ia_valid & ATTR_MODE))
 756                error = generic_acl_chmod(inode, &shmem_acl_ops);
 757#endif
 758        if (page)
 759                page_cache_release(page);
 760        return error;
 761}
 762
 763static void shmem_delete_inode(struct inode *inode)
 764{
 765        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 766        struct shmem_inode_info *info = SHMEM_I(inode);
 767
 768        if (inode->i_op->truncate == shmem_truncate) {
 769                truncate_inode_pages(inode->i_mapping, 0);
 770                shmem_unacct_size(info->flags, inode->i_size);
 771                inode->i_size = 0;
 772                shmem_truncate(inode);
 773                if (!list_empty(&info->swaplist)) {
 774                        spin_lock(&shmem_swaplist_lock);
 775                        list_del_init(&info->swaplist);
 776                        spin_unlock(&shmem_swaplist_lock);
 777                }
 778        }
 779        BUG_ON(inode->i_blocks);
 780        if (sbinfo->max_inodes) {
 781                spin_lock(&sbinfo->stat_lock);
 782                sbinfo->free_inodes++;
 783                spin_unlock(&sbinfo->stat_lock);
 784        }
 785        clear_inode(inode);
 786}
 787
 788static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
 789{
 790        swp_entry_t *ptr;
 791
 792        for (ptr = dir; ptr < edir; ptr++) {
 793                if (ptr->val == entry.val)
 794                        return ptr - dir;
 795        }
 796        return -1;
 797}
 798
 799static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
 800{
 801        struct inode *inode;
 802        unsigned long idx;
 803        unsigned long size;
 804        unsigned long limit;
 805        unsigned long stage;
 806        struct page **dir;
 807        struct page *subdir;
 808        swp_entry_t *ptr;
 809        int offset;
 810
 811        idx = 0;
 812        ptr = info->i_direct;
 813        spin_lock(&info->lock);
 814        limit = info->next_index;
 815        size = limit;
 816        if (size > SHMEM_NR_DIRECT)
 817                size = SHMEM_NR_DIRECT;
 818        offset = shmem_find_swp(entry, ptr, ptr+size);
 819        if (offset >= 0) {
 820                shmem_swp_balance_unmap();
 821                goto found;
 822        }
 823        if (!info->i_indirect)
 824                goto lost2;
 825
 826        dir = shmem_dir_map(info->i_indirect);
 827        stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
 828
 829        for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
 830                if (unlikely(idx == stage)) {
 831                        shmem_dir_unmap(dir-1);
 832                        dir = shmem_dir_map(info->i_indirect) +
 833                            ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 834                        while (!*dir) {
 835                                dir++;
 836                                idx += ENTRIES_PER_PAGEPAGE;
 837                                if (idx >= limit)
 838                                        goto lost1;
 839                        }
 840                        stage = idx + ENTRIES_PER_PAGEPAGE;
 841                        subdir = *dir;
 842                        shmem_dir_unmap(dir);
 843                        dir = shmem_dir_map(subdir);
 844                }
 845                subdir = *dir;
 846                if (subdir && page_private(subdir)) {
 847                        ptr = shmem_swp_map(subdir);
 848                        size = limit - idx;
 849                        if (size > ENTRIES_PER_PAGE)
 850                                size = ENTRIES_PER_PAGE;
 851                        offset = shmem_find_swp(entry, ptr, ptr+size);
 852                        if (offset >= 0) {
 853                                shmem_dir_unmap(dir);
 854                                goto found;
 855                        }
 856                        shmem_swp_unmap(ptr);
 857                }
 858        }
 859lost1:
 860        shmem_dir_unmap(dir-1);
 861lost2:
 862        spin_unlock(&info->lock);
 863        return 0;
 864found:
 865        idx += offset;
 866        inode = &info->vfs_inode;
 867        if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
 868                info->flags |= SHMEM_PAGEIN;
 869                shmem_swp_set(info, ptr + offset, 0);
 870        }
 871        shmem_swp_unmap(ptr);
 872        spin_unlock(&info->lock);
 873        /*
 874         * Decrement swap count even when the entry is left behind:
 875         * try_to_unuse will skip over mms, then reincrement count.
 876         */
 877        swap_free(entry);
 878        return 1;
 879}
 880
 881/*
  882 * shmem_unuse() searches for a possibly swapped-out shmem page.
 883 */
 884int shmem_unuse(swp_entry_t entry, struct page *page)
 885{
 886        struct list_head *p, *next;
 887        struct shmem_inode_info *info;
 888        int found = 0;
 889
 890        spin_lock(&shmem_swaplist_lock);
 891        list_for_each_safe(p, next, &shmem_swaplist) {
 892                info = list_entry(p, struct shmem_inode_info, swaplist);
 893                if (!info->swapped)
 894                        list_del_init(&info->swaplist);
 895                else if (shmem_unuse_inode(info, entry, page)) {
 896                        /* move head to start search for next from here */
 897                        list_move_tail(&shmem_swaplist, &info->swaplist);
 898                        found = 1;
 899                        break;
 900                }
 901        }
 902        spin_unlock(&shmem_swaplist_lock);
 903        return found;
 904}
 905
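/*
 * Illustrative aside (not part of the original file): shmem_unuse() is
 * called from swapoff's try_to_unuse() for each still-populated swap
 * entry, with the page already read back into the swap cache; the walk
 * above finds which tmpfs inode (if any) still references that entry.
 */
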
 906/*
 907 * Move the page from the page cache to the swap cache.
 908 */
 909static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 910{
 911        struct shmem_inode_info *info;
 912        swp_entry_t *entry, swap;
 913        struct address_space *mapping;
 914        unsigned long index;
 915        struct inode *inode;
 916
 917        BUG_ON(!PageLocked(page));
 918        /*
 919         * shmem_backing_dev_info's capabilities prevent regular writeback or
 920         * sync from ever calling shmem_writepage; but a stacking filesystem
 921         * may use the ->writepage of its underlying filesystem, in which case
 922         * we want to do nothing when that underlying filesystem is tmpfs
 923         * (writing out to swap is useful as a response to memory pressure, but
 924         * of no use to stabilize the data) - just redirty the page, unlock it
 925         * and claim success in this case.  AOP_WRITEPAGE_ACTIVATE, and the
 926         * page_mapped check below, must be avoided unless we're in reclaim.
 927         */
 928        if (!wbc->for_reclaim) {
 929                set_page_dirty(page);
 930                unlock_page(page);
 931                return 0;
 932        }
 933        BUG_ON(page_mapped(page));
 934
 935        mapping = page->mapping;
 936        index = page->index;
 937        inode = mapping->host;
 938        info = SHMEM_I(inode);
 939        if (info->flags & VM_LOCKED)
 940                goto redirty;
 941        swap = get_swap_page();
 942        if (!swap.val)
 943                goto redirty;
 944
 945        spin_lock(&info->lock);
 946        shmem_recalc_inode(inode);
 947        if (index >= info->next_index) {
 948                BUG_ON(!(info->flags & SHMEM_TRUNCATE));
 949                goto unlock;
 950        }
 951        entry = shmem_swp_entry(info, index, NULL);
 952        BUG_ON(!entry);
 953        BUG_ON(entry->val);
 954
 955        if (move_to_swap_cache(page, swap) == 0) {
 956                shmem_swp_set(info, entry, swap.val);
 957                shmem_swp_unmap(entry);
 958                spin_unlock(&info->lock);
 959                if (list_empty(&info->swaplist)) {
 960                        spin_lock(&shmem_swaplist_lock);
 961                        /* move instead of add in case we're racing */
 962                        list_move_tail(&info->swaplist, &shmem_swaplist);
 963                        spin_unlock(&shmem_swaplist_lock);
 964                }
 965                unlock_page(page);
 966                return 0;
 967        }
 968
 969        shmem_swp_unmap(entry);
 970unlock:
 971        spin_unlock(&info->lock);
 972        swap_free(swap);
 973redirty:
 974        set_page_dirty(page);
 975        return AOP_WRITEPAGE_ACTIVATE;  /* Return with the page locked */
 976}
 977
 978#ifdef CONFIG_NUMA
 979static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
 980{
 981        char *nodelist = strchr(value, ':');
 982        int err = 1;
 983
 984        if (nodelist) {
 985                /* NUL-terminate policy string */
 986                *nodelist++ = '\0';
 987                if (nodelist_parse(nodelist, *policy_nodes))
 988                        goto out;
 989                if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
 990                        goto out;
 991        }
 992        if (!strcmp(value, "default")) {
 993                *policy = MPOL_DEFAULT;
 994                /* Don't allow a nodelist */
 995                if (!nodelist)
 996                        err = 0;
 997        } else if (!strcmp(value, "prefer")) {
 998                *policy = MPOL_PREFERRED;
 999                /* Insist on a nodelist of one node only */
1000                if (nodelist) {
1001                        char *rest = nodelist;
1002                        while (isdigit(*rest))
1003                                rest++;
1004                        if (!*rest)
1005                                err = 0;
1006                }
1007        } else if (!strcmp(value, "bind")) {
1008                *policy = MPOL_BIND;
1009                /* Insist on a nodelist */
1010                if (nodelist)
1011                        err = 0;
1012        } else if (!strcmp(value, "interleave")) {
1013                *policy = MPOL_INTERLEAVE;
1014                /*
1015                 * Default to online nodes with memory if no nodelist
1016                 */
1017                if (!nodelist)
1018                        *policy_nodes = node_states[N_HIGH_MEMORY];
1019                err = 0;
1020        }
1021out:
1022        /* Restore string for error message */
1023        if (nodelist)
1024                *--nodelist = ':';
1025        return err;
1026}
1027
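/*
 * Illustrative aside (not part of the original file): the value parsed
 * above comes from the tmpfs "mpol=" mount option, for example
 * "mpol=default", "mpol=prefer:1", "mpol=bind:0-3" or
 * "mpol=interleave:0,2".  A nodelist is rejected for "default",
 * required for "bind", must name a single node for "prefer", and is
 * optional for "interleave" (defaulting to all nodes with memory).
 */
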
1028static struct page *shmem_swapin_async(struct shared_policy *p,
1029                                       swp_entry_t entry, unsigned long idx)
1030{
1031        struct page *page;
1032        struct vm_area_struct pvma;
1033
1034        /* Create a pseudo vma that just contains the policy */
1035        memset(&pvma, 0, sizeof(struct vm_area_struct));
1036        pvma.vm_end = PAGE_SIZE;
1037        pvma.vm_pgoff = idx;
1038        pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
1039        page = read_swap_cache_async(entry, &pvma, 0);
1040        mpol_free(pvma.vm_policy);
1041        return page;
1042}
1043
1044static struct page *shmem_swapin(struct shmem_inode_info *info,
1045                                 swp_entry_t entry, unsigned long idx)
1046{
1047        struct shared_policy *p = &info->policy;
1048        int i, num;
1049        struct page *page;
1050        unsigned long offset;
1051
1052        num = valid_swaphandles(entry, &offset);
1053        for (i = 0; i < num; offset++, i++) {
1054                page = shmem_swapin_async(p,
1055                                swp_entry(swp_type(entry), offset), idx);
1056                if (!page)
1057                        break;
1058                page_cache_release(page);
1059        }
1060        lru_add_drain();        /* Push any new pages onto the LRU now */
1061        return shmem_swapin_async(p, entry, idx);
1062}
1063
1064static struct page *
1065shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1066                 unsigned long idx)
1067{
1068        struct vm_area_struct pvma;
1069        struct page *page;
1070
1071        memset(&pvma, 0, sizeof(struct vm_area_struct));
1072        pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1073        pvma.vm_pgoff = idx;
1074        pvma.vm_end = PAGE_SIZE;
1075        page = alloc_page_vma(gfp, &pvma, 0);
1076        mpol_free(pvma.vm_policy);
1077        return page;
1078}
1079#else
1080static inline int shmem_parse_mpol(char *value, int *policy,
1081                                                nodemask_t *policy_nodes)
1082{
1083        return 1;
1084}
1085
1086static inline struct page *
1087shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
1088{
1089        swapin_readahead(entry, 0, NULL);
1090        return read_swap_cache_async(entry, NULL, 0);
1091}
1092
1093static inline struct page *
1094shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
1095{
1096        return alloc_page(gfp);
1097}
1098#endif
1099
1100/*
1101 * shmem_getpage - either get the page from swap or allocate a new one
1102 *
1103 * If we allocate a new one we do not mark it dirty. That's up to the
 1104 * vm. If we swap it in we mark it dirty, since we also free the swap
 1105 * entry: a page cannot live in both the swap and page cache.
1106 */
1107static int shmem_getpage(struct inode *inode, unsigned long idx,
1108                        struct page **pagep, enum sgp_type sgp, int *type)
1109{
1110        struct address_space *mapping = inode->i_mapping;
1111        struct shmem_inode_info *info = SHMEM_I(inode);
1112        struct shmem_sb_info *sbinfo;
1113        struct page *filepage = *pagep;
1114        struct page *swappage;
1115        swp_entry_t *entry;
1116        swp_entry_t swap;
1117        int error;
1118
1119        if (idx >= SHMEM_MAX_INDEX)
1120                return -EFBIG;
1121
1122        if (type)
1123                *type = 0;
1124
1125        /*
1126         * Normally, filepage is NULL on entry, and either found
1127         * uptodate immediately, or allocated and zeroed, or read
1128         * in under swappage, which is then assigned to filepage.
1129         * But shmem_readpage and shmem_write_begin pass in a locked
1130         * filepage, which may be found not uptodate by other callers
1131         * too, and may need to be copied from the swappage read in.
1132         */
1133repeat:
1134        if (!filepage)
1135                filepage = find_lock_page(mapping, idx);
1136        if (filepage && PageUptodate(filepage))
1137                goto done;
1138        error = 0;
1139        if (sgp == SGP_QUICK)
1140                goto failed;
1141
1142        spin_lock(&info->lock);
1143        shmem_recalc_inode(inode);
1144        entry = shmem_swp_alloc(info, idx, sgp);
1145        if (IS_ERR(entry)) {
1146                spin_unlock(&info->lock);
1147                error = PTR_ERR(entry);
1148                goto failed;
1149        }
1150        swap = *entry;
1151
1152        if (swap.val) {
1153                /* Look it up and read it in.. */
1154                swappage = lookup_swap_cache(swap);
1155                if (!swappage) {
1156                        shmem_swp_unmap(entry);
1157                        /* here we actually do the io */
1158                        if (type && !(*type & VM_FAULT_MAJOR)) {
1159                                __count_vm_event(PGMAJFAULT);
1160                                *type |= VM_FAULT_MAJOR;
1161                        }
1162                        spin_unlock(&info->lock);
1163                        swappage = shmem_swapin(info, swap, idx);
1164                        if (!swappage) {
1165                                spin_lock(&info->lock);
1166                                entry = shmem_swp_alloc(info, idx, sgp);
1167                                if (IS_ERR(entry))
1168                                        error = PTR_ERR(entry);
1169                                else {
1170                                        if (entry->val == swap.val)
1171                                                error = -ENOMEM;
1172                                        shmem_swp_unmap(entry);
1173                                }
1174                                spin_unlock(&info->lock);
1175                                if (error)
1176                                        goto failed;
1177                                goto repeat;
1178                        }
1179                        wait_on_page_locked(swappage);
1180                        page_cache_release(swappage);
1181                        goto repeat;
1182                }
1183
1184                /* We have to do this with page locked to prevent races */
1185                if (TestSetPageLocked(swappage)) {
1186                        shmem_swp_unmap(entry);
1187                        spin_unlock(&info->lock);
1188                        wait_on_page_locked(swappage);
1189                        page_cache_release(swappage);
1190                        goto repeat;
1191                }
1192                if (PageWriteback(swappage)) {
1193                        shmem_swp_unmap(entry);
1194                        spin_unlock(&info->lock);
1195                        wait_on_page_writeback(swappage);
1196                        unlock_page(swappage);
1197                        page_cache_release(swappage);
1198                        goto repeat;
1199                }
1200                if (!PageUptodate(swappage)) {
1201                        shmem_swp_unmap(entry);
1202                        spin_unlock(&info->lock);
1203                        unlock_page(swappage);
1204                        page_cache_release(swappage);
1205                        error = -EIO;
1206                        goto failed;
1207                }
1208
1209                if (filepage) {
1210                        shmem_swp_set(info, entry, 0);
1211                        shmem_swp_unmap(entry);
1212                        delete_from_swap_cache(swappage);
1213                        spin_unlock(&info->lock);
1214                        copy_highpage(filepage, swappage);
1215                        unlock_page(swappage);
1216                        page_cache_release(swappage);
1217                        flush_dcache_page(filepage);
1218                        SetPageUptodate(filepage);
1219                        set_page_dirty(filepage);
1220                        swap_free(swap);
1221                } else if (!(error = move_from_swap_cache(
1222                                swappage, idx, mapping))) {
1223                        info->flags |= SHMEM_PAGEIN;
1224                        shmem_swp_set(info, entry, 0);
1225                        shmem_swp_unmap(entry);
1226                        spin_unlock(&info->lock);
1227                        filepage = swappage;
1228                        swap_free(swap);
1229                } else {
1230                        shmem_swp_unmap(entry);
1231                        spin_unlock(&info->lock);
1232                        unlock_page(swappage);
1233                        page_cache_release(swappage);
1234                        if (error == -ENOMEM) {
1235                                /* let kswapd refresh zone for GFP_ATOMICs */
1236                                congestion_wait(WRITE, HZ/50);
1237                        }
1238                        goto repeat;
1239                }
1240        } else if (sgp == SGP_READ && !filepage) {
1241                shmem_swp_unmap(entry);
1242                filepage = find_get_page(mapping, idx);
1243                if (filepage &&
1244                    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1245                        spin_unlock(&info->lock);
1246                        wait_on_page_locked(filepage);
1247                        page_cache_release(filepage);
1248                        filepage = NULL;
1249                        goto repeat;
1250                }
1251                spin_unlock(&info->lock);
1252        } else {
1253                shmem_swp_unmap(entry);
1254                sbinfo = SHMEM_SB(inode->i_sb);
1255                if (sbinfo->max_blocks) {
1256                        spin_lock(&sbinfo->stat_lock);
1257                        if (sbinfo->free_blocks == 0 ||
1258                            shmem_acct_block(info->flags)) {
1259                                spin_unlock(&sbinfo->stat_lock);
1260                                spin_unlock(&info->lock);
1261                                error = -ENOSPC;
1262                                goto failed;
1263                        }
1264                        sbinfo->free_blocks--;
1265                        inode->i_blocks += BLOCKS_PER_PAGE;
1266                        spin_unlock(&sbinfo->stat_lock);
1267                } else if (shmem_acct_block(info->flags)) {
1268                        spin_unlock(&info->lock);
1269                        error = -ENOSPC;
1270                        goto failed;
1271                }
1272
1273                if (!filepage) {
1274                        spin_unlock(&info->lock);
1275                        filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1276                                                    info,
1277                                                    idx);
1278                        if (!filepage) {
1279                                shmem_unacct_blocks(info->flags, 1);
1280                                shmem_free_blocks(inode, 1);
1281                                error = -ENOMEM;
1282                                goto failed;
1283                        }
1284
1285                        spin_lock(&info->lock);
1286                        entry = shmem_swp_alloc(info, idx, sgp);
1287                        if (IS_ERR(entry))
1288                                error = PTR_ERR(entry);
1289                        else {
1290                                swap = *entry;
1291                                shmem_swp_unmap(entry);
1292                        }
1293                        if (error || swap.val || 0 != add_to_page_cache_lru(
1294                                        filepage, mapping, idx, GFP_ATOMIC)) {
1295                                spin_unlock(&info->lock);
1296                                page_cache_release(filepage);
1297                                shmem_unacct_blocks(info->flags, 1);
1298                                shmem_free_blocks(inode, 1);
1299                                filepage = NULL;
1300                                if (error)
1301                                        goto failed;
1302                                goto repeat;
1303                        }
1304                        info->flags |= SHMEM_PAGEIN;
1305                }
1306
1307                info->alloced++;
1308                spin_unlock(&info->lock);
1309                clear_highpage(filepage);
1310                flush_dcache_page(filepage);
1311                SetPageUptodate(filepage);
1312        }
1313done:
1314        if (*pagep != filepage) {
1315                *pagep = filepage;
1316                if (sgp != SGP_FAULT)
1317                        unlock_page(filepage);
1318
1319        }
1320        return 0;
1321
1322failed:
1323        if (*pagep != filepage) {
1324                unlock_page(filepage);
1325                page_cache_release(filepage);
1326        }
1327        return error;
1328}
1329
1330static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1331{
1332        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1333        int error;
1334        int ret;
1335
1336        if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1337                return VM_FAULT_SIGBUS;
1338
1339        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
1340        if (error)
1341                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1342
1343        mark_page_accessed(vmf->page);
1344        return ret | VM_FAULT_LOCKED;
1345}
1346
1347#ifdef CONFIG_NUMA
1348static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1349{
1350        struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1351        return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1352}
1353
1354static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1355                                          unsigned long addr)
1356{
1357        struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1358        unsigned long idx;
1359
1360        idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1361        return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1362}
1363#endif
1364
1365int shmem_lock(struct file *file, int lock, struct user_struct *user)
1366{
1367        struct inode *inode = file->f_path.dentry->d_inode;
1368        struct shmem_inode_info *info = SHMEM_I(inode);
1369        int retval = -ENOMEM;
1370
1371        spin_lock(&info->lock);
1372        if (lock && !(info->flags & VM_LOCKED)) {
1373                if (!user_shm_lock(inode->i_size, user))
1374                        goto out_nomem;
1375                info->flags |= VM_LOCKED;
1376        }
1377        if (!lock && (info->flags & VM_LOCKED) && user) {
1378                user_shm_unlock(inode->i_size, user);
1379                info->flags &= ~VM_LOCKED;
1380        }
1381        retval = 0;
1382out_nomem:
1383        spin_unlock(&info->lock);
1384        return retval;
1385}
1386
1387static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1388{
1389        file_accessed(file);
1390        vma->vm_ops = &shmem_vm_ops;
1391        vma->vm_flags |= VM_CAN_NONLINEAR;
1392        return 0;
1393}
1394
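/*
 * Allocate and initialize a new tmpfs inode.  On an inode-limited mount a
 * free inode is reserved under stat_lock first, and given back if
 * new_inode() fails; the S_IFMT switch then wires up the per-type inode
 * and file operations.
 */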
1395static struct inode *
1396shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1397{
1398        struct inode *inode;
1399        struct shmem_inode_info *info;
1400        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1401
1402        if (sbinfo->max_inodes) {
1403                spin_lock(&sbinfo->stat_lock);
1404                if (!sbinfo->free_inodes) {
1405                        spin_unlock(&sbinfo->stat_lock);
1406                        return NULL;
1407                }
1408                sbinfo->free_inodes--;
1409                spin_unlock(&sbinfo->stat_lock);
1410        }
1411
1412        inode = new_inode(sb);
1413        if (inode) {
1414                inode->i_mode = mode;
1415                inode->i_uid = current->fsuid;
1416                inode->i_gid = current->fsgid;
1417                inode->i_blocks = 0;
1418                inode->i_mapping->a_ops = &shmem_aops;
1419                inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1420                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1421                inode->i_generation = get_seconds();
1422                info = SHMEM_I(inode);
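                /* zero the shmem fields that precede the embedded vfs_inode */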
1423                memset(info, 0, (char *)inode - (char *)info);
1424                spin_lock_init(&info->lock);
1425                INIT_LIST_HEAD(&info->swaplist);
1426
1427                switch (mode & S_IFMT) {
1428                default:
1429                        inode->i_op = &shmem_special_inode_operations;
1430                        init_special_inode(inode, mode, dev);
1431                        break;
1432                case S_IFREG:
1433                        inode->i_op = &shmem_inode_operations;
1434                        inode->i_fop = &shmem_file_operations;
1435                        mpol_shared_policy_init(&info->policy, sbinfo->policy,
1436                                                        &sbinfo->policy_nodes);
1437                        break;
1438                case S_IFDIR:
1439                        inc_nlink(inode);
1440                        /* Some things misbehave if size == 0 on a directory */
1441                        inode->i_size = 2 * BOGO_DIRENT_SIZE;
1442                        inode->i_op = &shmem_dir_inode_operations;
1443                        inode->i_fop = &simple_dir_operations;
1444                        break;
1445                case S_IFLNK:
1446                        /*
1447                         * Must not load anything in the rbtree,
1448                         * mpol_free_shared_policy will not be called.
1449                         */
1450                        mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1451                                                NULL);
1452                        break;
1453                }
1454        } else if (sbinfo->max_inodes) {
1455                spin_lock(&sbinfo->stat_lock);
1456                sbinfo->free_inodes++;
1457                spin_unlock(&sbinfo->stat_lock);
1458        }
1459        return inode;
1460}
1461
1462#ifdef CONFIG_TMPFS
1463static const struct inode_operations shmem_symlink_inode_operations;
1464static const struct inode_operations shmem_symlink_inline_operations;
1465
1466/*
1467 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1468 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1469 * below the loop driver, in the generic fashion that many filesystems support.
1470 */
1471static int shmem_readpage(struct file *file, struct page *page)
1472{
1473        struct inode *inode = page->mapping->host;
1474        int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1475        unlock_page(page);
1476        return error;
1477}
1478
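/*
 * shmem_getpage(SGP_WRITE) may allocate a new page and allows writing
 * beyond i_size; shmem_write_end() then dirties the page, drops its
 * reference and extends i_size to cover what was copied.
 */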
1479static int
1480shmem_write_begin(struct file *file, struct address_space *mapping,
1481                        loff_t pos, unsigned len, unsigned flags,
1482                        struct page **pagep, void **fsdata)
1483{
1484        struct inode *inode = mapping->host;
1485        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1486        *pagep = NULL;
1487        return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1488}
1489
1490static int
1491shmem_write_end(struct file *file, struct address_space *mapping,
1492                        loff_t pos, unsigned len, unsigned copied,
1493                        struct page *page, void *fsdata)
1494{
1495        struct inode *inode = mapping->host;
1496
1497        set_page_dirty(page);
1498        page_cache_release(page);
1499
1500        if (pos+copied > inode->i_size)
1501                i_size_write(inode, pos+copied);
1502
1503        return copied;
1504}
1505
1506static ssize_t
1507shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1508{
1509        struct inode    *inode = file->f_path.dentry->d_inode;
1510        loff_t          pos;
1511        unsigned long   written;
1512        ssize_t         err;
1513
1514        if ((ssize_t) count < 0)
1515                return -EINVAL;
1516
1517        if (!access_ok(VERIFY_READ, buf, count))
1518                return -EFAULT;
1519
1520        mutex_lock(&inode->i_mutex);
1521
1522        pos = *ppos;
1523        written = 0;
1524
1525        err = generic_write_checks(file, &pos, &count, 0);
1526        if (err || !count)
1527                goto out;
1528
1529        err = remove_suid(file->f_path.dentry);
1530        if (err)
1531                goto out;
1532
1533        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535        do {
1536                struct page *page = NULL;
1537                unsigned long bytes, index, offset;
1538                char *kaddr;
1539                int left;
1540
1541                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1542                index = pos >> PAGE_CACHE_SHIFT;
1543                bytes = PAGE_CACHE_SIZE - offset;
1544                if (bytes > count)
1545                        bytes = count;
1546
1547                /*
1548                 * We don't hold page lock across copy from user -
1549                 * what would it guard against? - so no deadlock here.
1550                 * But it still may be a good idea to prefault below.
1551                 */
1552
1553                err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1554                if (err)
1555                        break;
1556
1557                left = bytes;
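                /*
                 * Highmem fast path: prefault the first and last byte with
                 * __get_user(), then attempt an atomic kmap copy; anything
                 * left over (or a lowmem page) falls through to the sleeping
                 * kmap()/__copy_from_user() path below.
                 */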
1558                if (PageHighMem(page)) {
1559                        volatile unsigned char dummy;
1560                        __get_user(dummy, buf);
1561                        __get_user(dummy, buf + bytes - 1);
1562
1563                        kaddr = kmap_atomic(page, KM_USER0);
1564                        left = __copy_from_user_inatomic(kaddr + offset,
1565                                                        buf, bytes);
1566                        kunmap_atomic(kaddr, KM_USER0);
1567                }
1568                if (left) {
1569                        kaddr = kmap(page);
1570                        left = __copy_from_user(kaddr + offset, buf, bytes);
1571                        kunmap(page);
1572                }
1573
1574                written += bytes;
1575                count -= bytes;
1576                pos += bytes;
1577                buf += bytes;
1578                if (pos > inode->i_size)
1579                        i_size_write(inode, pos);
1580
1581                flush_dcache_page(page);
1582                set_page_dirty(page);
1583                mark_page_accessed(page);
1584                page_cache_release(page);
1585
1586                if (left) {
1587                        pos -= left;
1588                        written -= left;
1589                        err = -EFAULT;
1590                        break;
1591                }
1592
1593                /*
1594                 * Our dirty pages are not counted in nr_dirty,
1595                 * and we do not attempt to balance dirty pages.
1596                 */
1597
1598                cond_resched();
1599        } while (count);
1600
1601        *ppos = pos;
1602        if (written)
1603                err = written;
1604out:
1605        mutex_unlock(&inode->i_mutex);
1606        return err;
1607}
1608
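/*
 * Read loop used by shmem_file_read() below, via file_read_actor.  i_size
 * is re-checked after each shmem_getpage() because reads run without
 * i_mutex, and holes (no page returned) are served from ZERO_PAGE rather
 * than allocating.
 */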
1609static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1610{
1611        struct inode *inode = filp->f_path.dentry->d_inode;
1612        struct address_space *mapping = inode->i_mapping;
1613        unsigned long index, offset;
1614
1615        index = *ppos >> PAGE_CACHE_SHIFT;
1616        offset = *ppos & ~PAGE_CACHE_MASK;
1617
1618        for (;;) {
1619                struct page *page = NULL;
1620                unsigned long end_index, nr, ret;
1621                loff_t i_size = i_size_read(inode);
1622
1623                end_index = i_size >> PAGE_CACHE_SHIFT;
1624                if (index > end_index)
1625                        break;
1626                if (index == end_index) {
1627                        nr = i_size & ~PAGE_CACHE_MASK;
1628                        if (nr <= offset)
1629                                break;
1630                }
1631
1632                desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1633                if (desc->error) {
1634                        if (desc->error == -EINVAL)
1635                                desc->error = 0;
1636                        break;
1637                }
1638
1639                /*
1640                 * We must re-check i_size after getting the page: reads (unlike
1641                 * writes) are called without i_mutex protection against truncate.
1642                 */
1643                nr = PAGE_CACHE_SIZE;
1644                i_size = i_size_read(inode);
1645                end_index = i_size >> PAGE_CACHE_SHIFT;
1646                if (index == end_index) {
1647                        nr = i_size & ~PAGE_CACHE_MASK;
1648                        if (nr <= offset) {
1649                                if (page)
1650                                        page_cache_release(page);
1651                                break;
1652                        }
1653                }
1654                nr -= offset;
1655
1656                if (page) {
1657                        /*
1658                         * If users can be writing to this page using arbitrary
1659                         * virtual addresses, take care about potential aliasing
1660                         * before reading the page on the kernel side.
1661                         */
1662                        if (mapping_writably_mapped(mapping))
1663                                flush_dcache_page(page);
1664                        /*
1665                         * Mark the page accessed if we read the beginning.
1666                         */
1667                        if (!offset)
1668                                mark_page_accessed(page);
1669                } else {
1670                        page = ZERO_PAGE(0);
1671                        page_cache_get(page);
1672                }
1673
1674                /*
1675                 * Ok, we have the page, and it's up-to-date, so
1676                 * now we can copy it to user space...
1677                 *
1678                 * The actor routine returns how many bytes were actually used.
1679                 * NOTE! This may not be the same as how much of a user buffer
1680                 * we filled up (we may be padding etc), so we can only update
1681                 * "pos" here (the actor routine has to update the user buffer
1682                 * pointers and the remaining count).
1683                 */
1684                ret = actor(desc, page, offset, nr);
1685                offset += ret;
1686                index += offset >> PAGE_CACHE_SHIFT;
1687                offset &= ~PAGE_CACHE_MASK;
1688
1689                page_cache_release(page);
1690                if (ret != nr || !desc->count)
1691                        break;
1692
1693                cond_resched();
1694        }
1695
1696        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1697        file_accessed(filp);
1698}
1699
1700static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1701{
1702        read_descriptor_t desc;
1703
1704        if ((ssize_t) count < 0)
1705                return -EINVAL;
1706        if (!access_ok(VERIFY_WRITE, buf, count))
1707                return -EFAULT;
1708        if (!count)
1709                return 0;
1710
1711        desc.written = 0;
1712        desc.count = count;
1713        desc.arg.buf = buf;
1714        desc.error = 0;
1715
1716        do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1717        if (desc.written)
1718                return desc.written;
1719        return desc.error;
1720}
1721
1722static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1723{
1724        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1725
1726        buf->f_type = TMPFS_MAGIC;
1727        buf->f_bsize = PAGE_CACHE_SIZE;
1728        buf->f_namelen = NAME_MAX;
1729        spin_lock(&sbinfo->stat_lock);
1730        if (sbinfo->max_blocks) {
1731                buf->f_blocks = sbinfo->max_blocks;
1732                buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1733        }
1734        if (sbinfo->max_inodes) {
1735                buf->f_files = sbinfo->max_inodes;
1736                buf->f_ffree = sbinfo->free_inodes;
1737        }
1738        /* else leave those fields 0 like simple_statfs */
1739        spin_unlock(&sbinfo->stat_lock);
1740        return 0;
1741}
1742
1743/*
1744 * File creation. Allocate an inode, and we're done.
1745 */
1746static int
1747shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1748{
1749        struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1750        int error = -ENOSPC;
1751
1752        if (inode) {
1753                error = security_inode_init_security(inode, dir, NULL, NULL,
1754                                                     NULL);
1755                if (error) {
1756                        if (error != -EOPNOTSUPP) {
1757                                iput(inode);
1758                                return error;
1759                        }
1760                }
1761                error = shmem_acl_init(inode, dir);
1762                if (error) {
1763                        iput(inode);
1764                        return error;
1765                }
1766                if (dir->i_mode & S_ISGID) {
1767                        inode->i_gid = dir->i_gid;
1768                        if (S_ISDIR(mode))
1769                                inode->i_mode |= S_ISGID;
1770                }
1771                dir->i_size += BOGO_DIRENT_SIZE;
1772                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1773                d_instantiate(dentry, inode);
1774                dget(dentry); /* Extra count - pin the dentry in core */
1775        }
1776        return error;
1777}
1778
1779static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1780{
1781        int error;
1782
1783        if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1784                return error;
1785        inc_nlink(dir);
1786        return 0;
1787}
1788
1789static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1790                struct nameidata *nd)
1791{
1792        return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1793}
1794
1795/*
1796 * Link a file.
1797 */
1798static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1799{
1800        struct inode *inode = old_dentry->d_inode;
1801        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1802
1803        /*
1804         * No ordinary (disk based) filesystem counts links as inodes;
1805         * but each new link needs a new dentry, pinning lowmem, and
1806         * tmpfs dentries cannot be pruned until they are unlinked.
1807         */
1808        if (sbinfo->max_inodes) {
1809                spin_lock(&sbinfo->stat_lock);
1810                if (!sbinfo->free_inodes) {
1811                        spin_unlock(&sbinfo->stat_lock);
1812                        return -ENOSPC;
1813                }
1814                sbinfo->free_inodes--;
1815                spin_unlock(&sbinfo->stat_lock);
1816        }
1817
1818        dir->i_size += BOGO_DIRENT_SIZE;
1819        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1820        inc_nlink(inode);
1821        atomic_inc(&inode->i_count);    /* New dentry reference */
1822        dget(dentry);           /* Extra pinning count for the created dentry */
1823        d_instantiate(dentry, inode);
1824        return 0;
1825}
1826
1827static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1828{
1829        struct inode *inode = dentry->d_inode;
1830
1831        if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1832                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1833                if (sbinfo->max_inodes) {
1834                        spin_lock(&sbinfo->stat_lock);
1835                        sbinfo->free_inodes++;
1836                        spin_unlock(&sbinfo->stat_lock);
1837                }
1838        }
1839
1840        dir->i_size -= BOGO_DIRENT_SIZE;
1841        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1842        drop_nlink(inode);
1843        dput(dentry);   /* Undo the count from "create" - this does all the work */
1844        return 0;
1845}
1846
1847static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1848{
1849        if (!simple_empty(dentry))
1850                return -ENOTEMPTY;
1851
1852        drop_nlink(dentry->d_inode);
1853        drop_nlink(dir);
1854        return shmem_unlink(dir, dentry);
1855}
1856
1857/*
1858 * The VFS layer already does all the dentry stuff for rename;
1859 * we just have to decrement the usage count for the target if
1860 * it exists so that the VFS layer correctly frees it when it
1861 * gets overwritten.
1862 */
1863static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1864{
1865        struct inode *inode = old_dentry->d_inode;
1866        int they_are_dirs = S_ISDIR(inode->i_mode);
1867
1868        if (!simple_empty(new_dentry))
1869                return -ENOTEMPTY;
1870
1871        if (new_dentry->d_inode) {
1872                (void) shmem_unlink(new_dir, new_dentry);
1873                if (they_are_dirs)
1874                        drop_nlink(old_dir);
1875        } else if (they_are_dirs) {
1876                drop_nlink(old_dir);
1877                inc_nlink(new_dir);
1878        }
1879
1880        old_dir->i_size -= BOGO_DIRENT_SIZE;
1881        new_dir->i_size += BOGO_DIRENT_SIZE;
1882        old_dir->i_ctime = old_dir->i_mtime =
1883        new_dir->i_ctime = new_dir->i_mtime =
1884        inode->i_ctime = CURRENT_TIME;
1885        return 0;
1886}
1887
1888static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1889{
1890        int error;
1891        int len;
1892        struct inode *inode;
1893        struct page *page = NULL;
1894        char *kaddr;
1895        struct shmem_inode_info *info;
1896
1897        len = strlen(symname) + 1;
1898        if (len > PAGE_CACHE_SIZE)
1899                return -ENAMETOOLONG;
1900
1901        inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1902        if (!inode)
1903                return -ENOSPC;
1904
1905        error = security_inode_init_security(inode, dir, NULL, NULL,
1906                                             NULL);
1907        if (error) {
1908                if (error != -EOPNOTSUPP) {
1909                        iput(inode);
1910                        return error;
1911                }
1912                error = 0;
1913        }
1914
1915        info = SHMEM_I(inode);
1916        inode->i_size = len-1;
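        /*
         * Short targets are stored inline, in the shmem_inode_info fields
         * in front of the embedded vfs_inode; longer ones go into page 0
         * of the file, read back by shmem_follow_link().
         */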
1917        if (len <= (char *)inode - (char *)info) {
1918                /* do it inline */
1919                memcpy(info, symname, len);
1920                inode->i_op = &shmem_symlink_inline_operations;
1921        } else {
1922                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1923                if (error) {
1924                        iput(inode);
1925                        return error;
1926                }
1927                inode->i_op = &shmem_symlink_inode_operations;
1928                kaddr = kmap_atomic(page, KM_USER0);
1929                memcpy(kaddr, symname, len);
1930                kunmap_atomic(kaddr, KM_USER0);
1931                set_page_dirty(page);
1932                page_cache_release(page);
1933        }
1934        if (dir->i_mode & S_ISGID)
1935                inode->i_gid = dir->i_gid;
1936        dir->i_size += BOGO_DIRENT_SIZE;
1937        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1938        d_instantiate(dentry, inode);
1939        dget(dentry);
1940        return 0;
1941}
1942
1943static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1944{
1945        nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1946        return NULL;
1947}
1948
1949static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1950{
1951        struct page *page = NULL;
1952        int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1953        nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1954        return page;
1955}
1956
1957static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1958{
1959        if (!IS_ERR(nd_get_link(nd))) {
1960                struct page *page = cookie;
1961                kunmap(page);
1962                mark_page_accessed(page);
1963                page_cache_release(page);
1964        }
1965}
1966
1967static const struct inode_operations shmem_symlink_inline_operations = {
1968        .readlink       = generic_readlink,
1969        .follow_link    = shmem_follow_link_inline,
1970};
1971
1972static const struct inode_operations shmem_symlink_inode_operations = {
1973        .truncate       = shmem_truncate,
1974        .readlink       = generic_readlink,
1975        .follow_link    = shmem_follow_link,
1976        .put_link       = shmem_put_link,
1977};
1978
1979#ifdef CONFIG_TMPFS_POSIX_ACL
1980/*
1981 * Superblocks without xattr inode operations will get security.* xattr
1982 * support from the VFS "for free". As soon as we have any other xattrs
1983 * like ACLs, we also need to implement the security.* handlers at
1984 * filesystem level, though.
1985 */
1986
1987static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1988                                        size_t list_len, const char *name,
1989                                        size_t name_len)
1990{
1991        return security_inode_listsecurity(inode, list, list_len);
1992}
1993
1994static int shmem_xattr_security_get(struct inode *inode, const char *name,
1995                                    void *buffer, size_t size)
1996{
1997        if (strcmp(name, "") == 0)
1998                return -EINVAL;
1999        return security_inode_getsecurity(inode, name, buffer, size,
2000                                          -EOPNOTSUPP);
2001}
2002
2003static int shmem_xattr_security_set(struct inode *inode, const char *name,
2004                                    const void *value, size_t size, int flags)
2005{
2006        if (strcmp(name, "") == 0)
2007                return -EINVAL;
2008        return security_inode_setsecurity(inode, name, value, size, flags);
2009}
2010
2011static struct xattr_handler shmem_xattr_security_handler = {
2012        .prefix = XATTR_SECURITY_PREFIX,
2013        .list   = shmem_xattr_security_list,
2014        .get    = shmem_xattr_security_get,
2015        .set    = shmem_xattr_security_set,
2016};
2017
2018static struct xattr_handler *shmem_xattr_handlers[] = {
2019        &shmem_xattr_acl_access_handler,
2020        &shmem_xattr_acl_default_handler,
2021        &shmem_xattr_security_handler,
2022        NULL
2023};
2024#endif
2025
2026static struct dentry *shmem_get_parent(struct dentry *child)
2027{
2028        return ERR_PTR(-ESTALE);
2029}
2030
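/*
 * NFS export support: the file handle is three 32-bit words, fh[0] being
 * i_generation and fh[1]/fh[2] the low/high halves of i_ino.  Inodes are
 * hashed at i_ino + i_generation so that ilookup5() can find them again
 * in shmem_fh_to_dentry().
 */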
2031static int shmem_match(struct inode *ino, void *vfh)
2032{
2033        __u32 *fh = vfh;
2034        __u64 inum = fh[2];
2035        inum = (inum << 32) | fh[1];
2036        return ino->i_ino == inum && fh[0] == ino->i_generation;
2037}
2038
2039static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2040                struct fid *fid, int fh_len, int fh_type)
2041{
2042        struct inode *inode;
2043        struct dentry *dentry = NULL;
2044        u64 inum;
2045
2046        if (fh_len < 3)
2047                return NULL;
2048
2049        inum = ((u64)fid->raw[2] << 32) | fid->raw[1];
2050        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2051                        shmem_match, fid->raw);
2052        if (inode) {
2053                dentry = d_find_alias(inode);
2054                iput(inode);
2055        }
2056
2057        return dentry;
2058}
2059
2060static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2061                                int connectable)
2062{
2063        struct inode *inode = dentry->d_inode;
2064
2065        if (*len < 3)
2066                return 255;
2067
2068        if (hlist_unhashed(&inode->i_hash)) {
2069                /* Unfortunately insert_inode_hash is not idempotent,
2070                 * so as we hash inodes here rather than at creation
2071                 * time, we need a lock to ensure we only try
2072                 * to do it once.
2073                 */
2074                static DEFINE_SPINLOCK(lock);
2075                spin_lock(&lock);
2076                if (hlist_unhashed(&inode->i_hash))
2077                        __insert_inode_hash(inode,
2078                                            inode->i_ino + inode->i_generation);
2079                spin_unlock(&lock);
2080        }
2081
2082        fh[0] = inode->i_generation;
2083        fh[1] = inode->i_ino;
2084        fh[2] = ((__u64)inode->i_ino) >> 32;
2085
2086        *len = 3;
2087        return 1;
2088}
2089
2090static const struct export_operations shmem_export_ops = {
2091        .get_parent     = shmem_get_parent,
2092        .encode_fh      = shmem_encode_fh,
2093        .fh_to_dentry   = shmem_fh_to_dentry,
2094};
2095
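/*
 * Parse the mount options.  Numeric values go through memparse(), so
 * "size", "nr_blocks" and "nr_inodes" accept k/m/g suffixes, and "size"
 * additionally accepts a trailing '%' meaning a percentage of total RAM.
 */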
2096static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2097        gid_t *gid, unsigned long *blocks, unsigned long *inodes,
2098        int *policy, nodemask_t *policy_nodes)
2099{
2100        char *this_char, *value, *rest;
2101
2102        while (options != NULL) {
2103                this_char = options;
2104                for (;;) {
2105                        /*
2106                         * NUL-terminate this option: unfortunately,
2107                         * mount options form a comma-separated list,
2108                         * but mpol's nodelist may also contain commas.
2109                         */
2110                        options = strchr(options, ',');
2111                        if (options == NULL)
2112                                break;
2113                        options++;
2114                        if (!isdigit(*options)) {
2115                                options[-1] = '\0';
2116                                break;
2117                        }
2118                }
2119                if (!*this_char)
2120                        continue;
2121                if ((value = strchr(this_char,'=')) != NULL) {
2122                        *value++ = 0;
2123                } else {
2124                        printk(KERN_ERR
2125                            "tmpfs: No value for mount option '%s'\n",
2126                            this_char);
2127                        return 1;
2128                }
2129
2130                if (!strcmp(this_char,"size")) {
2131                        unsigned long long size;
2132                        size = memparse(value,&rest);
2133                        if (*rest == '%') {
2134                                size <<= PAGE_SHIFT;
2135                                size *= totalram_pages;
2136                                do_div(size, 100);
2137                                rest++;
2138                        }
2139                        if (*rest)
2140                                goto bad_val;
2141                        *blocks = size >> PAGE_CACHE_SHIFT;
2142                } else if (!strcmp(this_char,"nr_blocks")) {
2143                        *blocks = memparse(value,&rest);
2144                        if (*rest)
2145                                goto bad_val;
2146                } else if (!strcmp(this_char,"nr_inodes")) {
2147                        *inodes = memparse(value,&rest);
2148                        if (*rest)
2149                                goto bad_val;
2150                } else if (!strcmp(this_char,"mode")) {
2151                        if (!mode)
2152                                continue;
2153                        *mode = simple_strtoul(value,&rest,8);
2154                        if (*rest)
2155                                goto bad_val;
2156                } else if (!strcmp(this_char,"uid")) {
2157                        if (!uid)
2158                                continue;
2159                        *uid = simple_strtoul(value,&rest,0);
2160                        if (*rest)
2161                                goto bad_val;
2162                } else if (!strcmp(this_char,"gid")) {
2163                        if (!gid)
2164                                continue;
2165                        *gid = simple_strtoul(value,&rest,0);
2166                        if (*rest)
2167                                goto bad_val;
2168                } else if (!strcmp(this_char,"mpol")) {
2169                        if (shmem_parse_mpol(value,policy,policy_nodes))
2170                                goto bad_val;
2171                } else {
2172                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2173                               this_char);
2174                        return 1;
2175                }
2176        }
2177        return 0;
2178
2179bad_val:
2180        printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2181               value, this_char);
2182        return 1;
2183
2184}
2185
2186static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2187{
2188        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2189        unsigned long max_blocks = sbinfo->max_blocks;
2190        unsigned long max_inodes = sbinfo->max_inodes;
2191        int policy = sbinfo->policy;
2192        nodemask_t policy_nodes = sbinfo->policy_nodes;
2193        unsigned long blocks;
2194        unsigned long inodes;
2195        int error = -EINVAL;
2196
2197        if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
2198                                &max_inodes, &policy, &policy_nodes))
2199                return error;
2200
2201        spin_lock(&sbinfo->stat_lock);
2202        blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2203        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2204        if (max_blocks < blocks)
2205                goto out;
2206        if (max_inodes < inodes)
2207                goto out;
2208        /*
2209         * Those tests also disallow limited->unlimited while any are in
2210         * use, so i_blocks will always be zero when max_blocks is zero;
2211         * but we must separately disallow unlimited->limited, because
2212         * in that case we have no record of how much is already in use.
2213         */
2214        if (max_blocks && !sbinfo->max_blocks)
2215                goto out;
2216        if (max_inodes && !sbinfo->max_inodes)
2217                goto out;
2218
2219        error = 0;
2220        sbinfo->max_blocks  = max_blocks;
2221        sbinfo->free_blocks = max_blocks - blocks;
2222        sbinfo->max_inodes  = max_inodes;
2223        sbinfo->free_inodes = max_inodes - inodes;
2224        sbinfo->policy = policy;
2225        sbinfo->policy_nodes = policy_nodes;
2226out:
2227        spin_unlock(&sbinfo->stat_lock);
2228        return error;
2229}
2230#endif
2231
2232static void shmem_put_super(struct super_block *sb)
2233{
2234        kfree(sb->s_fs_info);
2235        sb->s_fs_info = NULL;
2236}
2237
2238static int shmem_fill_super(struct super_block *sb,
2239                            void *data, int silent)
2240{
2241        struct inode *inode;
2242        struct dentry *root;
2243        int mode   = S_IRWXUGO | S_ISVTX;
2244        uid_t uid = current->fsuid;
2245        gid_t gid = current->fsgid;
2246        int err = -ENOMEM;
2247        struct shmem_sb_info *sbinfo;
2248        unsigned long blocks = 0;
2249        unsigned long inodes = 0;
2250        int policy = MPOL_DEFAULT;
2251        nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
2252
2253#ifdef CONFIG_TMPFS
2254        /*
2255         * By default we only allow half of the physical RAM per
2256         * tmpfs instance, limiting inodes to one per page of lowmem;
2257         * but the internal instance is left unlimited.
2258         */
2259        if (!(sb->s_flags & MS_NOUSER)) {
2260                blocks = totalram_pages / 2;
2261                inodes = totalram_pages - totalhigh_pages;
2262                if (inodes > blocks)
2263                        inodes = blocks;
2264                if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
2265                                        &inodes, &policy, &policy_nodes))
2266                        return -EINVAL;
2267        }
2268        sb->s_export_op = &shmem_export_ops;
2269#else
2270        sb->s_flags |= MS_NOUSER;
2271#endif
2272
2273        /* Round up to L1_CACHE_BYTES to resist false sharing */
2274        sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
2275                                L1_CACHE_BYTES), GFP_KERNEL);
2276        if (!sbinfo)
2277                return -ENOMEM;
2278
2279        spin_lock_init(&sbinfo->stat_lock);
2280        sbinfo->max_blocks = blocks;
2281        sbinfo->free_blocks = blocks;
2282        sbinfo->max_inodes = inodes;
2283        sbinfo->free_inodes = inodes;
2284        sbinfo->policy = policy;
2285        sbinfo->policy_nodes = policy_nodes;
2286
2287        sb->s_fs_info = sbinfo;
2288        sb->s_maxbytes = SHMEM_MAX_BYTES;
2289        sb->s_blocksize = PAGE_CACHE_SIZE;
2290        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2291        sb->s_magic = TMPFS_MAGIC;
2292        sb->s_op = &shmem_ops;
2293        sb->s_time_gran = 1;
2294#ifdef CONFIG_TMPFS_POSIX_ACL
2295        sb->s_xattr = shmem_xattr_handlers;
2296        sb->s_flags |= MS_POSIXACL;
2297#endif
2298
2299        inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2300        if (!inode)
2301                goto failed;
2302        inode->i_uid = uid;
2303        inode->i_gid = gid;
2304        root = d_alloc_root(inode);
2305        if (!root)
2306                goto failed_iput;
2307        sb->s_root = root;
2308        return 0;
2309
2310failed_iput:
2311        iput(inode);
2312failed:
2313        shmem_put_super(sb);
2314        return err;
2315}
2316
2317static struct kmem_cache *shmem_inode_cachep;
2318
2319static struct inode *shmem_alloc_inode(struct super_block *sb)
2320{
2321        struct shmem_inode_info *p;
2322        p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2323        if (!p)
2324                return NULL;
2325        return &p->vfs_inode;
2326}
2327
2328static void shmem_destroy_inode(struct inode *inode)
2329{
2330        if ((inode->i_mode & S_IFMT) == S_IFREG) {
2331                /* only struct inode is valid if it's an inline symlink */
2332                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2333        }
2334        shmem_acl_destroy_inode(inode);
2335        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2336}
2337
2338static void init_once(struct kmem_cache *cachep, void *foo)
2339{
2340        struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2341
2342        inode_init_once(&p->vfs_inode);
2343#ifdef CONFIG_TMPFS_POSIX_ACL
2344        p->i_acl = NULL;
2345        p->i_default_acl = NULL;
2346#endif
2347}
2348
2349static int init_inodecache(void)
2350{
2351        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2352                                sizeof(struct shmem_inode_info),
2353                                0, SLAB_PANIC, init_once);
2354        return 0;
2355}
2356
2357static void destroy_inodecache(void)
2358{
2359        kmem_cache_destroy(shmem_inode_cachep);
2360}
2361
2362static const struct address_space_operations shmem_aops = {
2363        .writepage      = shmem_writepage,
2364        .set_page_dirty = __set_page_dirty_no_writeback,
2365#ifdef CONFIG_TMPFS
2366        .readpage       = shmem_readpage,
2367        .write_begin    = shmem_write_begin,
2368        .write_end      = shmem_write_end,
2369#endif
2370        .migratepage    = migrate_page,
2371};
2372
2373static const struct file_operations shmem_file_operations = {
2374        .mmap           = shmem_mmap,
2375#ifdef CONFIG_TMPFS
2376        .llseek         = generic_file_llseek,
2377        .read           = shmem_file_read,
2378        .write          = shmem_file_write,
2379        .fsync          = simple_sync_file,
2380        .splice_read    = generic_file_splice_read,
2381        .splice_write   = generic_file_splice_write,
2382#endif
2383};
2384
2385static const struct inode_operations shmem_inode_operations = {
2386        .truncate       = shmem_truncate,
2387        .setattr        = shmem_notify_change,
2388        .truncate_range = shmem_truncate_range,
2389#ifdef CONFIG_TMPFS_POSIX_ACL
2390        .setxattr       = generic_setxattr,
2391        .getxattr       = generic_getxattr,
2392        .listxattr      = generic_listxattr,
2393        .removexattr    = generic_removexattr,
2394        .permission     = shmem_permission,
2395#endif
2396
2397};
2398
2399static const struct inode_operations shmem_dir_inode_operations = {
2400#ifdef CONFIG_TMPFS
2401        .create         = shmem_create,
2402        .lookup         = simple_lookup,
2403        .link           = shmem_link,
2404        .unlink         = shmem_unlink,
2405        .symlink        = shmem_symlink,
2406        .mkdir          = shmem_mkdir,
2407        .rmdir          = shmem_rmdir,
2408        .mknod          = shmem_mknod,
2409        .rename         = shmem_rename,
2410#endif
2411#ifdef CONFIG_TMPFS_POSIX_ACL
2412        .setattr        = shmem_notify_change,
2413        .setxattr       = generic_setxattr,
2414        .getxattr       = generic_getxattr,
2415        .listxattr      = generic_listxattr,
2416        .removexattr    = generic_removexattr,
2417        .permission     = shmem_permission,
2418#endif
2419};
2420
2421static const struct inode_operations shmem_special_inode_operations = {
2422#ifdef CONFIG_TMPFS_POSIX_ACL
2423        .setattr        = shmem_notify_change,
2424        .setxattr       = generic_setxattr,
2425        .getxattr       = generic_getxattr,
2426        .listxattr      = generic_listxattr,
2427        .removexattr    = generic_removexattr,
2428        .permission     = shmem_permission,
2429#endif
2430};
2431
2432static const struct super_operations shmem_ops = {
2433        .alloc_inode    = shmem_alloc_inode,
2434        .destroy_inode  = shmem_destroy_inode,
2435#ifdef CONFIG_TMPFS
2436        .statfs         = shmem_statfs,
2437        .remount_fs     = shmem_remount_fs,
2438#endif
2439        .delete_inode   = shmem_delete_inode,
2440        .drop_inode     = generic_delete_inode,
2441        .put_super      = shmem_put_super,
2442};
2443
2444static struct vm_operations_struct shmem_vm_ops = {
2445        .fault          = shmem_fault,
2446#ifdef CONFIG_NUMA
2447        .set_policy     = shmem_set_policy,
2448        .get_policy     = shmem_get_policy,
2449#endif
2450};
2451
2452
2453static int shmem_get_sb(struct file_system_type *fs_type,
2454        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2455{
2456        return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2457}
2458
2459static struct file_system_type tmpfs_fs_type = {
2460        .owner          = THIS_MODULE,
2461        .name           = "tmpfs",
2462        .get_sb         = shmem_get_sb,
2463        .kill_sb        = kill_litter_super,
2464};
2465static struct vfsmount *shm_mnt;
2466
2467static int __init init_tmpfs(void)
2468{
2469        int error;
2470
2471        error = bdi_init(&shmem_backing_dev_info);
2472        if (error)
2473                goto out4;
2474
2475        error = init_inodecache();
2476        if (error)
2477                goto out3;
2478
2479        error = register_filesystem(&tmpfs_fs_type);
2480        if (error) {
2481                printk(KERN_ERR "Could not register tmpfs\n");
2482                goto out2;
2483        }
2484
2485        shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2486                                tmpfs_fs_type.name, NULL);
2487        if (IS_ERR(shm_mnt)) {
2488                error = PTR_ERR(shm_mnt);
2489                printk(KERN_ERR "Could not kern_mount tmpfs\n");
2490                goto out1;
2491        }
2492        return 0;
2493
2494out1:
2495        unregister_filesystem(&tmpfs_fs_type);
2496out2:
2497        destroy_inodecache();
2498out3:
2499        bdi_destroy(&shmem_backing_dev_info);
2500out4:
2501        shm_mnt = ERR_PTR(error);
2502        return error;
2503}
2504module_init(init_tmpfs)
2505
2506/*
2507 * shmem_file_setup - get an unlinked file living in tmpfs
2508 *
2509 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2510 * @size: size to be set for the file
2511 * @flags: vm_flags; VM_ACCOUNT requests that the full size be charged up front
2512 */
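/*
 * A minimal usage sketch (not taken from this file): an in-kernel caller
 * wanting an unlinked, swap-backed file of a given size might do
 *
 *      file = shmem_file_setup("dev/example", size, VM_ACCOUNT);
 *      if (IS_ERR(file))
 *              return PTR_ERR(file);
 *
 * shmem_zero_setup() below is such a caller, passing the vma's vm_flags.
 * The "dev/example" name is illustrative only: it shows up in
 * /proc/<pid>/maps but never in the filesystem namespace.
 */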
2513struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2514{
2515        int error;
2516        struct file *file;
2517        struct inode *inode;
2518        struct dentry *dentry, *root;
2519        struct qstr this;
2520
2521        if (IS_ERR(shm_mnt))
2522                return (void *)shm_mnt;
2523
2524        if (size < 0 || size > SHMEM_MAX_BYTES)
2525                return ERR_PTR(-EINVAL);
2526
2527        if (shmem_acct_size(flags, size))
2528                return ERR_PTR(-ENOMEM);
2529
2530        error = -ENOMEM;
2531        this.name = name;
2532        this.len = strlen(name);
2533        this.hash = 0; /* will go */
2534        root = shm_mnt->mnt_root;
2535        dentry = d_alloc(root, &this);
2536        if (!dentry)
2537                goto put_memory;
2538
2539        error = -ENFILE;
2540        file = get_empty_filp();
2541        if (!file)
2542                goto put_dentry;
2543
2544        error = -ENOSPC;
2545        inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2546        if (!inode)
2547                goto close_file;
2548
2549        SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2550        d_instantiate(dentry, inode);
2551        inode->i_size = size;
2552        inode->i_nlink = 0;     /* It is unlinked */
2553        init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2554                        &shmem_file_operations);
2555        return file;
2556
2557close_file:
2558        put_filp(file);
2559put_dentry:
2560        dput(dentry);
2561put_memory:
2562        shmem_unacct_size(flags, size);
2563        return ERR_PTR(error);
2564}
2565
2566/*
2567 * shmem_zero_setup - setup a shared anonymous mapping
2568 *
2569 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2570 */
2571int shmem_zero_setup(struct vm_area_struct *vma)
2572{
2573        struct file *file;
2574        loff_t size = vma->vm_end - vma->vm_start;
2575
2576        file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2577        if (IS_ERR(file))
2578                return PTR_ERR(file);
2579
2580        if (vma->vm_file)
2581                fput(vma->vm_file);
2582        vma->vm_file = file;
2583        vma->vm_ops = &shmem_vm_ops;
2584        return 0;
2585}
2586