LXR linux/mm/shmem.c

   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *               2000 Transmeta Corp.
   6 *               2000-2001 Christoph Rohland
   7 *               2000-2001 SAP AG
   8 *               2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/mm.h>
  32#include <linux/export.h>
  33#include <linux/swap.h>
  34#include <linux/uio.h>
  35
  36static struct vfsmount *shm_mnt;
  37
  38#ifdef CONFIG_SHMEM
  39/*
  40 * This virtual memory filesystem is heavily based on the ramfs. It
  41 * extends ramfs by the ability to use swap and honor resource limits
  42 * which makes it a completely usable filesystem.
  43 */
  44
  45#include <linux/xattr.h>
  46#include <linux/exportfs.h>
  47#include <linux/posix_acl.h>
  48#include <linux/posix_acl_xattr.h>
  49#include <linux/mman.h>
  50#include <linux/string.h>
  51#include <linux/slab.h>
  52#include <linux/backing-dev.h>
  53#include <linux/shmem_fs.h>
  54#include <linux/writeback.h>
  55#include <linux/blkdev.h>
  56#include <linux/pagevec.h>
  57#include <linux/percpu_counter.h>
  58#include <linux/falloc.h>
  59#include <linux/splice.h>
  60#include <linux/security.h>
  61#include <linux/swapops.h>
  62#include <linux/mempolicy.h>
  63#include <linux/namei.h>
  64#include <linux/ctype.h>
  65#include <linux/migrate.h>
  66#include <linux/highmem.h>
  67#include <linux/seq_file.h>
  68#include <linux/magic.h>
  69#include <linux/syscalls.h>
  70#include <linux/fcntl.h>
  71#include <uapi/linux/memfd.h>
  72
  73#include <asm/uaccess.h>
  74#include <asm/pgtable.h>
  75
  76#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
  77#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
  78
  79/* Pretend that each entry is of this size in directory's i_size */
  80#define BOGO_DIRENT_SIZE 20
  81
  82/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
  83#define SHORT_SYMLINK_LEN 128
  84
  85/*
  86 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
  87 * inode->i_private (with i_mutex making sure that it has only one user at
  88 * a time): we would prefer not to enlarge the shmem inode just for that.
  89 */
  90struct shmem_falloc {
  91        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
  92        pgoff_t start;          /* start of range currently being fallocated */
  93        pgoff_t next;           /* the next page offset to be fallocated */
  94        pgoff_t nr_falloced;    /* how many new pages have been fallocated */
  95        pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
  96};
  97
  98/* Flag allocation requirements to shmem_getpage */
  99enum sgp_type {
 100        SGP_READ,       /* don't exceed i_size, don't allocate page */
 101        SGP_CACHE,      /* don't exceed i_size, may allocate page */
 102        SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
 103        SGP_WRITE,      /* may exceed i_size, may allocate !Uptodate page */
 104        SGP_FALLOC,     /* like SGP_WRITE, but make existing page Uptodate */
 105};
 106
 107#ifdef CONFIG_TMPFS
 108static unsigned long shmem_default_max_blocks(void)
 109{
 110        return totalram_pages / 2;
 111}
 112
 113static unsigned long shmem_default_max_inodes(void)
 114{
 115        return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
 116}
 117#endif
 118
 119static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 120static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 121                                struct shmem_inode_info *info, pgoff_t index);
 122static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 123        struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 124
 125static inline int shmem_getpage(struct inode *inode, pgoff_t index,
 126        struct page **pagep, enum sgp_type sgp, int *fault_type)
 127{
 128        return shmem_getpage_gfp(inode, index, pagep, sgp,
 129                        mapping_gfp_mask(inode->i_mapping), fault_type);
 130}
 131
 132static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 133{
 134        return sb->s_fs_info;
 135}
 136
 137/*
 138 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 139 * for shared memory and for shared anonymous (/dev/zero) mappings
 140 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 141 * consistent with the pre-accounting of private mappings ...
 142 */
 143static inline int shmem_acct_size(unsigned long flags, loff_t size)
 144{
 145        return (flags & VM_NORESERVE) ?
 146                0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 147}
 148
 149static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 150{
 151        if (!(flags & VM_NORESERVE))
 152                vm_unacct_memory(VM_ACCT(size));
 153}
 154
 155static inline int shmem_reacct_size(unsigned long flags,
 156                loff_t oldsize, loff_t newsize)
 157{
 158        if (!(flags & VM_NORESERVE)) {
 159                if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 160                        return security_vm_enough_memory_mm(current->mm,
 161                                        VM_ACCT(newsize) - VM_ACCT(oldsize));
 162                else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 163                        vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 164        }
 165        return 0;
 166}
 167
 168/*
 169 * ... whereas tmpfs objects are accounted incrementally as
 170 * pages are allocated, in order to allow huge sparse files.
 171 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 172 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 173 */
 174static inline int shmem_acct_block(unsigned long flags)
 175{
 176        return (flags & VM_NORESERVE) ?
 177                security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
 178}
 179
 180static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 181{
 182        if (flags & VM_NORESERVE)
 183                vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
 184}
 185
 186static const struct super_operations shmem_ops;
 187static const struct address_space_operations shmem_aops;
 188static const struct file_operations shmem_file_operations;
 189static const struct inode_operations shmem_inode_operations;
 190static const struct inode_operations shmem_dir_inode_operations;
 191static const struct inode_operations shmem_special_inode_operations;
 192static const struct vm_operations_struct shmem_vm_ops;
 193
 194static LIST_HEAD(shmem_swaplist);
 195static DEFINE_MUTEX(shmem_swaplist_mutex);
 196
 197static int shmem_reserve_inode(struct super_block *sb)
 198{
 199        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 200        if (sbinfo->max_inodes) {
 201                spin_lock(&sbinfo->stat_lock);
 202                if (!sbinfo->free_inodes) {
 203                        spin_unlock(&sbinfo->stat_lock);
 204                        return -ENOSPC;
 205                }
 206                sbinfo->free_inodes--;
 207                spin_unlock(&sbinfo->stat_lock);
 208        }
 209        return 0;
 210}
 211
 212static void shmem_free_inode(struct super_block *sb)
 213{
 214        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 215        if (sbinfo->max_inodes) {
 216                spin_lock(&sbinfo->stat_lock);
 217                sbinfo->free_inodes++;
 218                spin_unlock(&sbinfo->stat_lock);
 219        }
 220}
 221
 222/**
 223 * shmem_recalc_inode - recalculate the block usage of an inode
 224 * @inode: inode to recalc
 225 *
 226 * We have to calculate the free blocks since the mm can drop
 227 * undirtied hole pages behind our back.
 228 *
 229 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 230 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 231 *
 232 * It has to be called with the spinlock held.
 233 */
 234static void shmem_recalc_inode(struct inode *inode)
 235{
 236        struct shmem_inode_info *info = SHMEM_I(inode);
 237        long freed;
 238
 239        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 240        if (freed > 0) {
 241                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 242                if (sbinfo->max_blocks)
 243                        percpu_counter_add(&sbinfo->used_blocks, -freed);
 244                info->alloced -= freed;
 245                inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 246                shmem_unacct_blocks(info->flags, freed);
 247        }
 248}
 249
 250/*
 251 * Replace item expected in radix tree by a new item, while holding tree lock.
 252 */
 253static int shmem_radix_tree_replace(struct address_space *mapping,
 254                        pgoff_t index, void *expected, void *replacement)
 255{
 256        void **pslot;
 257        void *item;
 258
 259        VM_BUG_ON(!expected);
 260        VM_BUG_ON(!replacement);
 261        pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
 262        if (!pslot)
 263                return -ENOENT;
 264        item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
 265        if (item != expected)
 266                return -ENOENT;
 267        radix_tree_replace_slot(pslot, replacement);
 268        return 0;
 269}
 270
 271/*
 272 * Sometimes, before we decide whether to proceed or to fail, we must check
 273 * that an entry was not already brought back from swap by a racing thread.
 274 *
 275 * Checking page is not enough: by the time a SwapCache page is locked, it
 276 * might be reused, and again be SwapCache, using the same swap as before.
 277 */
 278static bool shmem_confirm_swap(struct address_space *mapping,
 279                               pgoff_t index, swp_entry_t swap)
 280{
 281        void *item;
 282
 283        rcu_read_lock();
 284        item = radix_tree_lookup(&mapping->page_tree, index);
 285        rcu_read_unlock();
 286        return item == swp_to_radix_entry(swap);
 287}
 288
 289/*
 290 * Like add_to_page_cache_locked, but error if expected item has gone.
 291 */
 292static int shmem_add_to_page_cache(struct page *page,
 293                                   struct address_space *mapping,
 294                                   pgoff_t index, void *expected)
 295{
 296        int error;
 297
 298        VM_BUG_ON_PAGE(!PageLocked(page), page);
 299        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 300
 301        page_cache_get(page);
 302        page->mapping = mapping;
 303        page->index = index;
 304
 305        spin_lock_irq(&mapping->tree_lock);
 306        if (!expected)
 307                error = radix_tree_insert(&mapping->page_tree, index, page);
 308        else
 309                error = shmem_radix_tree_replace(mapping, index, expected,
 310                                                                 page);
 311        if (!error) {
 312                mapping->nrpages++;
 313                __inc_zone_page_state(page, NR_FILE_PAGES);
 314                __inc_zone_page_state(page, NR_SHMEM);
 315                spin_unlock_irq(&mapping->tree_lock);
 316        } else {
 317                page->mapping = NULL;
 318                spin_unlock_irq(&mapping->tree_lock);
 319                page_cache_release(page);
 320        }
 321        return error;
 322}
 323
 324/*
 325 * Like delete_from_page_cache, but substitutes swap for page.
 326 */
 327static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 328{
 329        struct address_space *mapping = page->mapping;
 330        int error;
 331
 332        spin_lock_irq(&mapping->tree_lock);
 333        error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
 334        page->mapping = NULL;
 335        mapping->nrpages--;
 336        __dec_zone_page_state(page, NR_FILE_PAGES);
 337        __dec_zone_page_state(page, NR_SHMEM);
 338        spin_unlock_irq(&mapping->tree_lock);
 339        page_cache_release(page);
 340        BUG_ON(error);
 341}
 342
 343/*
 344 * Remove swap entry from radix tree, free the swap and its page cache.
 345 */
 346static int shmem_free_swap(struct address_space *mapping,
 347                           pgoff_t index, void *radswap)
 348{
 349        void *old;
 350
 351        spin_lock_irq(&mapping->tree_lock);
 352        old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
 353        spin_unlock_irq(&mapping->tree_lock);
 354        if (old != radswap)
 355                return -ENOENT;
 356        free_swap_and_cache(radix_to_swp_entry(radswap));
 357        return 0;
 358}
 359
 360/*
 361 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 362 */
 363void shmem_unlock_mapping(struct address_space *mapping)
 364{
 365        struct pagevec pvec;
 366        pgoff_t indices[PAGEVEC_SIZE];
 367        pgoff_t index = 0;
 368
 369        pagevec_init(&pvec, 0);
 370        /*
 371         * Minor point, but we might as well stop if someone else SHM_LOCKs it.
 372         */
 373        while (!mapping_unevictable(mapping)) {
 374                /*
 375                 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
 376                 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
 377                 */
 378                pvec.nr = find_get_entries(mapping, index,
 379                                           PAGEVEC_SIZE, pvec.pages, indices);
 380                if (!pvec.nr)
 381                        break;
 382                index = indices[pvec.nr - 1] + 1;
 383                pagevec_remove_exceptionals(&pvec);
 384                check_move_unevictable_pages(pvec.pages, pvec.nr);
 385                pagevec_release(&pvec);
 386                cond_resched();
 387        }
 388}
 389
 390/*
 391 * Remove range of pages and swap entries from radix tree, and free them.
 392 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 393 */
 394static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 395                                                                 bool unfalloc)
 396{
 397        struct address_space *mapping = inode->i_mapping;
 398        struct shmem_inode_info *info = SHMEM_I(inode);
 399        pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 400        pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
 401        unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
 402        unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 403        struct pagevec pvec;
 404        pgoff_t indices[PAGEVEC_SIZE];
 405        long nr_swaps_freed = 0;
 406        pgoff_t index;
 407        int i;
 408
 409        if (lend == -1)
 410                end = -1;       /* unsigned, so actually very big */
 411
 412        pagevec_init(&pvec, 0);
 413        index = start;
 414        while (index < end) {
 415                pvec.nr = find_get_entries(mapping, index,
 416                        min(end - index, (pgoff_t)PAGEVEC_SIZE),
 417                        pvec.pages, indices);
 418                if (!pvec.nr)
 419                        break;
 420                for (i = 0; i < pagevec_count(&pvec); i++) {
 421                        struct page *page = pvec.pages[i];
 422
 423                        index = indices[i];
 424                        if (index >= end)
 425                                break;
 426
 427                        if (radix_tree_exceptional_entry(page)) {
 428                                if (unfalloc)
 429                                        continue;
 430                                nr_swaps_freed += !shmem_free_swap(mapping,
 431                                                                index, page);
 432                                continue;
 433                        }
 434
 435                        if (!trylock_page(page))
 436                                continue;
 437                        if (!unfalloc || !PageUptodate(page)) {
 438                                if (page->mapping == mapping) {
 439                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
 440                                        truncate_inode_page(mapping, page);
 441                                }
 442                        }
 443                        unlock_page(page);
 444                }
 445                pagevec_remove_exceptionals(&pvec);
 446                pagevec_release(&pvec);
 447                cond_resched();
 448                index++;
 449        }
 450
 451        if (partial_start) {
 452                struct page *page = NULL;
 453                shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 454                if (page) {
 455                        unsigned int top = PAGE_CACHE_SIZE;
 456                        if (start > end) {
 457                                top = partial_end;
 458                                partial_end = 0;
 459                        }
 460                        zero_user_segment(page, partial_start, top);
 461                        set_page_dirty(page);
 462                        unlock_page(page);
 463                        page_cache_release(page);
 464                }
 465        }
 466        if (partial_end) {
 467                struct page *page = NULL;
 468                shmem_getpage(inode, end, &page, SGP_READ, NULL);
 469                if (page) {
 470                        zero_user_segment(page, 0, partial_end);
 471                        set_page_dirty(page);
 472                        unlock_page(page);
 473                        page_cache_release(page);
 474                }
 475        }
 476        if (start >= end)
 477                return;
 478
 479        index = start;
 480        while (index < end) {
 481                cond_resched();
 482
 483                pvec.nr = find_get_entries(mapping, index,
 484                                min(end - index, (pgoff_t)PAGEVEC_SIZE),
 485                                pvec.pages, indices);
 486                if (!pvec.nr) {
 487                        /* If all gone or hole-punch or unfalloc, we're done */
 488                        if (index == start || end != -1)
 489                                break;
 490                        /* But if truncating, restart to make sure all gone */
 491                        index = start;
 492                        continue;
 493                }
 494                for (i = 0; i < pagevec_count(&pvec); i++) {
 495                        struct page *page = pvec.pages[i];
 496
 497                        index = indices[i];
 498                        if (index >= end)
 499                                break;
 500
 501                        if (radix_tree_exceptional_entry(page)) {
 502                                if (unfalloc)
 503                                        continue;
 504                                if (shmem_free_swap(mapping, index, page)) {
 505                                        /* Swap was replaced by page: retry */
 506                                        index--;
 507                                        break;
 508                                }
 509                                nr_swaps_freed++;
 510                                continue;
 511                        }
 512
 513                        lock_page(page);
 514                        if (!unfalloc || !PageUptodate(page)) {
 515                                if (page->mapping == mapping) {
 516                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
 517                                        truncate_inode_page(mapping, page);
 518                                } else {
 519                                        /* Page was replaced by swap: retry */
 520                                        unlock_page(page);
 521                                        index--;
 522                                        break;
 523                                }
 524                        }
 525                        unlock_page(page);
 526                }
 527                pagevec_remove_exceptionals(&pvec);
 528                pagevec_release(&pvec);
 529                index++;
 530        }
 531
 532        spin_lock(&info->lock);
 533        info->swapped -= nr_swaps_freed;
 534        shmem_recalc_inode(inode);
 535        spin_unlock(&info->lock);
 536}
 537
 538void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 539{
 540        shmem_undo_range(inode, lstart, lend, false);
 541        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 542}
 543EXPORT_SYMBOL_GPL(shmem_truncate_range);
 544
 545static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 546{
 547        struct inode *inode = d_inode(dentry);
 548        struct shmem_inode_info *info = SHMEM_I(inode);
 549        int error;
 550
 551        error = inode_change_ok(inode, attr);
 552        if (error)
 553                return error;
 554
 555        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 556                loff_t oldsize = inode->i_size;
 557                loff_t newsize = attr->ia_size;
 558
 559                /* protected by i_mutex */
 560                if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
 561                    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 562                        return -EPERM;
 563
 564                if (newsize != oldsize) {
 565                        error = shmem_reacct_size(SHMEM_I(inode)->flags,
 566                                        oldsize, newsize);
 567                        if (error)
 568                                return error;
 569                        i_size_write(inode, newsize);
 570                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 571                }
 572                if (newsize < oldsize) {
 573                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
 574                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
 575                        shmem_truncate_range(inode, newsize, (loff_t)-1);
 576                        /* unmap again to remove racily COWed private pages */
 577                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
 578                }
 579        }
 580
 581        setattr_copy(inode, attr);
 582        if (attr->ia_valid & ATTR_MODE)
 583                error = posix_acl_chmod(inode, inode->i_mode);
 584        return error;
 585}
 586
 587static void shmem_evict_inode(struct inode *inode)
 588{
 589        struct shmem_inode_info *info = SHMEM_I(inode);
 590
 591        if (inode->i_mapping->a_ops == &shmem_aops) {
 592                shmem_unacct_size(info->flags, inode->i_size);
 593                inode->i_size = 0;
 594                shmem_truncate_range(inode, 0, (loff_t)-1);
 595                if (!list_empty(&info->swaplist)) {
 596                        mutex_lock(&shmem_swaplist_mutex);
 597                        list_del_init(&info->swaplist);
 598                        mutex_unlock(&shmem_swaplist_mutex);
 599                }
 600        } else
 601                kfree(info->symlink);
 602
 603        simple_xattrs_free(&info->xattrs);
 604        WARN_ON(inode->i_blocks);
 605        shmem_free_inode(inode->i_sb);
 606        clear_inode(inode);
 607}
 608
 609/*
 610 * If swap found in inode, free it and move page from swapcache to filecache.
 611 */
 612static int shmem_unuse_inode(struct shmem_inode_info *info,
 613                             swp_entry_t swap, struct page **pagep)
 614{
 615        struct address_space *mapping = info->vfs_inode.i_mapping;
 616        void *radswap;
 617        pgoff_t index;
 618        gfp_t gfp;
 619        int error = 0;
 620
 621        radswap = swp_to_radix_entry(swap);
 622        index = radix_tree_locate_item(&mapping->page_tree, radswap);
 623        if (index == -1)
 624                return -EAGAIN; /* tell shmem_unuse we found nothing */
 625
 626        /*
 627         * Move _head_ to start search for next from here.
 628         * But be careful: shmem_evict_inode checks list_empty without taking
 629         * mutex, and there's an instant in list_move_tail when info->swaplist
 630         * would appear empty, if it were the only one on shmem_swaplist.
 631         */
 632        if (shmem_swaplist.next != &info->swaplist)
 633                list_move_tail(&shmem_swaplist, &info->swaplist);
 634
 635        gfp = mapping_gfp_mask(mapping);
 636        if (shmem_should_replace_page(*pagep, gfp)) {
 637                mutex_unlock(&shmem_swaplist_mutex);
 638                error = shmem_replace_page(pagep, gfp, info, index);
 639                mutex_lock(&shmem_swaplist_mutex);
 640                /*
 641                 * We needed to drop mutex to make that restrictive page
 642                 * allocation, but the inode might have been freed while we
 643                 * dropped it: although a racing shmem_evict_inode() cannot
 644                 * complete without emptying the radix_tree, our page lock
 645                 * on this swapcache page is not enough to prevent that -
 646                 * free_swap_and_cache() of our swap entry will only
 647                 * trylock_page(), removing swap from radix_tree whatever.
 648                 *
 649                 * We must not proceed to shmem_add_to_page_cache() if the
 650                 * inode has been freed, but of course we cannot rely on
 651                 * inode or mapping or info to check that.  However, we can
 652                 * safely check if our swap entry is still in use (and here
 653                 * it can't have got reused for another page): if it's still
 654                 * in use, then the inode cannot have been freed yet, and we
 655                 * can safely proceed (if it's no longer in use, that tells
 656                 * nothing about the inode, but we don't need to unuse swap).
 657                 */
 658                if (!page_swapcount(*pagep))
 659                        error = -ENOENT;
 660        }
 661
 662        /*
 663         * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 664         * but also to hold up shmem_evict_inode(): so inode cannot be freed
 665         * beneath us (pagelock doesn't help until the page is in pagecache).
 666         */
 667        if (!error)
 668                error = shmem_add_to_page_cache(*pagep, mapping, index,
 669                                                radswap);
 670        if (error != -ENOMEM) {
 671                /*
 672                 * Truncation and eviction use free_swap_and_cache(), which
 673                 * only does trylock page: if we raced, best clean up here.
 674                 */
 675                delete_from_swap_cache(*pagep);
 676                set_page_dirty(*pagep);
 677                if (!error) {
 678                        spin_lock(&info->lock);
 679                        info->swapped--;
 680                        spin_unlock(&info->lock);
 681                        swap_free(swap);
 682                }
 683        }
 684        return error;
 685}
 686
 687/*
 688 * Search through swapped inodes to find and replace swap by page.
 689 */
 690int shmem_unuse(swp_entry_t swap, struct page *page)
 691{
 692        struct list_head *this, *next;
 693        struct shmem_inode_info *info;
 694        struct mem_cgroup *memcg;
 695        int error = 0;
 696
 697        /*
 698         * There's a faint possibility that swap page was replaced before
 699         * caller locked it: caller will come back later with the right page.
 700         */
 701        if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
 702                goto out;
 703
 704        /*
 705         * Charge page using GFP_KERNEL while we can wait, before taking
 706         * the shmem_swaplist_mutex which might hold up shmem_writepage().
 707         * Charged back to the user (not to caller) when swap account is used.
 708         */
 709        error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 710        if (error)
 711                goto out;
 712        /* No radix_tree_preload: swap entry keeps a place for page in tree */
 713        error = -EAGAIN;
 714
 715        mutex_lock(&shmem_swaplist_mutex);
 716        list_for_each_safe(this, next, &shmem_swaplist) {
 717                info = list_entry(this, struct shmem_inode_info, swaplist);
 718                if (info->swapped)
 719                        error = shmem_unuse_inode(info, swap, &page);
 720                else
 721                        list_del_init(&info->swaplist);
 722                cond_resched();
 723                if (error != -EAGAIN)
 724                        break;
 725                /* found nothing in this: move on to search the next */
 726        }
 727        mutex_unlock(&shmem_swaplist_mutex);
 728
 729        if (error) {
 730                if (error != -ENOMEM)
 731                        error = 0;
 732                mem_cgroup_cancel_charge(page, memcg);
 733        } else
 734                mem_cgroup_commit_charge(page, memcg, true);
 735out:
 736        unlock_page(page);
 737        page_cache_release(page);
 738        return error;
 739}
 740
 741/*
 742 * Move the page from the page cache to the swap cache.
 743 */
 744static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 745{
 746        struct shmem_inode_info *info;
 747        struct address_space *mapping;
 748        struct inode *inode;
 749        swp_entry_t swap;
 750        pgoff_t index;
 751
 752        BUG_ON(!PageLocked(page));
 753        mapping = page->mapping;
 754        index = page->index;
 755        inode = mapping->host;
 756        info = SHMEM_I(inode);
 757        if (info->flags & VM_LOCKED)
 758                goto redirty;
 759        if (!total_swap_pages)
 760                goto redirty;
 761
 762        /*
 763         * Our capabilities prevent regular writeback or sync from ever calling
 764         * shmem_writepage; but a stacking filesystem might use ->writepage of
 765         * its underlying filesystem, in which case tmpfs should write out to
 766         * swap only in response to memory pressure, and not for the writeback
 767         * threads or sync.
 768         */
 769        if (!wbc->for_reclaim) {
 770                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
 771                goto redirty;
 772        }
 773
 774        /*
 775         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
 776         * value into swapfile.c, the only way we can correctly account for a
 777         * fallocated page arriving here is now to initialize it and write it.
 778         *
 779         * That's okay for a page already fallocated earlier, but if we have
 780         * not yet completed the fallocation, then (a) we want to keep track
 781         * of this page in case we have to undo it, and (b) it may not be a
 782         * good idea to continue anyway, once we're pushing into swap.  So
 783         * reactivate the page, and let shmem_fallocate() quit when too many.
 784         */
 785        if (!PageUptodate(page)) {
 786                if (inode->i_private) {
 787                        struct shmem_falloc *shmem_falloc;
 788                        spin_lock(&inode->i_lock);
 789                        shmem_falloc = inode->i_private;
 790                        if (shmem_falloc &&
 791                            !shmem_falloc->waitq &&
 792                            index >= shmem_falloc->start &&
 793                            index < shmem_falloc->next)
 794                                shmem_falloc->nr_unswapped++;
 795                        else
 796                                shmem_falloc = NULL;
 797                        spin_unlock(&inode->i_lock);
 798                        if (shmem_falloc)
 799                                goto redirty;
 800                }
 801                clear_highpage(page);
 802                flush_dcache_page(page);
 803                SetPageUptodate(page);
 804        }
 805
 806        swap = get_swap_page();
 807        if (!swap.val)
 808                goto redirty;
 809
 810        /*
 811         * Add inode to shmem_unuse()'s list of swapped-out inodes,
 812         * if it's not already there.  Do it now before the page is
 813         * moved to swap cache, when its pagelock no longer protects
 814         * the inode from eviction.  But don't unlock the mutex until
 815         * we've incremented swapped, because shmem_unuse_inode() will
 816         * prune a !swapped inode from the swaplist under this mutex.
 817         */
 818        mutex_lock(&shmem_swaplist_mutex);
 819        if (list_empty(&info->swaplist))
 820                list_add_tail(&info->swaplist, &shmem_swaplist);
 821
 822        if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
 823                swap_shmem_alloc(swap);
 824                shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
 825
 826                spin_lock(&info->lock);
 827                info->swapped++;
 828                shmem_recalc_inode(inode);
 829                spin_unlock(&info->lock);
 830
 831                mutex_unlock(&shmem_swaplist_mutex);
 832                BUG_ON(page_mapped(page));
 833                swap_writepage(page, wbc);
 834                return 0;
 835        }
 836
 837        mutex_unlock(&shmem_swaplist_mutex);
 838        swapcache_free(swap);
 839redirty:
 840        set_page_dirty(page);
 841        if (wbc->for_reclaim)
 842                return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
 843        unlock_page(page);
 844        return 0;
 845}
 846
 847#ifdef CONFIG_NUMA
 848#ifdef CONFIG_TMPFS
 849static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 850{
 851        char buffer[64];
 852
 853        if (!mpol || mpol->mode == MPOL_DEFAULT)
 854                return;         /* show nothing */
 855
 856        mpol_to_str(buffer, sizeof(buffer), mpol);
 857
 858        seq_printf(seq, ",mpol=%s", buffer);
 859}
 860
 861static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 862{
 863        struct mempolicy *mpol = NULL;
 864        if (sbinfo->mpol) {
 865                spin_lock(&sbinfo->stat_lock);  /* prevent replace/use races */
 866                mpol = sbinfo->mpol;
 867                mpol_get(mpol);
 868                spin_unlock(&sbinfo->stat_lock);
 869        }
 870        return mpol;
 871}
 872#endif /* CONFIG_TMPFS */
 873
 874static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 875                        struct shmem_inode_info *info, pgoff_t index)
 876{
 877        struct vm_area_struct pvma;
 878        struct page *page;
 879
 880        /* Create a pseudo vma that just contains the policy */
 881        pvma.vm_start = 0;
 882        /* Bias interleave by inode number to distribute better across nodes */
 883        pvma.vm_pgoff = index + info->vfs_inode.i_ino;
 884        pvma.vm_ops = NULL;
 885        pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 886
 887        page = swapin_readahead(swap, gfp, &pvma, 0);
 888
 889        /* Drop reference taken by mpol_shared_policy_lookup() */
 890        mpol_cond_put(pvma.vm_policy);
 891
 892        return page;
 893}
 894
 895static struct page *shmem_alloc_page(gfp_t gfp,
 896                        struct shmem_inode_info *info, pgoff_t index)
 897{
 898        struct vm_area_struct pvma;
 899        struct page *page;
 900
 901        /* Create a pseudo vma that just contains the policy */
 902        pvma.vm_start = 0;
 903        /* Bias interleave by inode number to distribute better across nodes */
 904        pvma.vm_pgoff = index + info->vfs_inode.i_ino;
 905        pvma.vm_ops = NULL;
 906        pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 907
 908        page = alloc_page_vma(gfp, &pvma, 0);
 909
 910        /* Drop reference taken by mpol_shared_policy_lookup() */
 911        mpol_cond_put(pvma.vm_policy);
 912
 913        return page;
 914}
 915#else /* !CONFIG_NUMA */
 916#ifdef CONFIG_TMPFS
 917static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 918{
 919}
 920#endif /* CONFIG_TMPFS */
 921
 922static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 923                        struct shmem_inode_info *info, pgoff_t index)
 924{
 925        return swapin_readahead(swap, gfp, NULL, 0);
 926}
 927
 928static inline struct page *shmem_alloc_page(gfp_t gfp,
 929                        struct shmem_inode_info *info, pgoff_t index)
 930{
 931        return alloc_page(gfp);
 932}
 933#endif /* CONFIG_NUMA */
 934
 935#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
 936static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 937{
 938        return NULL;
 939}
 940#endif
 941
 942/*
 943 * When a page is moved from swapcache to shmem filecache (either by the
 944 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
 945 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 946 * ignorance of the mapping it belongs to.  If that mapping has special
 947 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 948 * we may need to copy to a suitable page before moving to filecache.
 949 *
 950 * In a future release, this may well be extended to respect cpuset and
 951 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 952 * but for now it is a simple matter of zone.
 953 */
 954static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
 955{
 956        return page_zonenum(page) > gfp_zone(gfp);
 957}
 958
 959static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 960                                struct shmem_inode_info *info, pgoff_t index)
 961{
 962        struct page *oldpage, *newpage;
 963        struct address_space *swap_mapping;
 964        pgoff_t swap_index;
 965        int error;
 966
 967        oldpage = *pagep;
 968        swap_index = page_private(oldpage);
 969        swap_mapping = page_mapping(oldpage);
 970
 971        /*
 972         * We have arrived here because our zones are constrained, so don't
 973         * limit chance of success by further cpuset and node constraints.
 974         */
 975        gfp &= ~GFP_CONSTRAINT_MASK;
 976        newpage = shmem_alloc_page(gfp, info, index);
 977        if (!newpage)
 978                return -ENOMEM;
 979
 980        page_cache_get(newpage);
 981        copy_highpage(newpage, oldpage);
 982        flush_dcache_page(newpage);
 983
 984        __set_page_locked(newpage);
 985        SetPageUptodate(newpage);
 986        SetPageSwapBacked(newpage);
 987        set_page_private(newpage, swap_index);
 988        SetPageSwapCache(newpage);
 989
 990        /*
 991         * Our caller will very soon move newpage out of swapcache, but it's
 992         * a nice clean interface for us to replace oldpage by newpage there.
 993         */
 994        spin_lock_irq(&swap_mapping->tree_lock);
 995        error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
 996                                                                   newpage);
 997        if (!error) {
 998                __inc_zone_page_state(newpage, NR_FILE_PAGES);
 999                __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1000        }

1001        spin_unlock_irq(&swap_mapping->tree_lock);
1002
1003        if (unlikely(error)) {
1004                /*
1005                 * Is this possible?  I think not, now that our callers check
1006                 * both PageSwapCache and page_private after getting page lock;
1007                 * but be defensive.  Reverse old to newpage for clear and free.
1008                 */
1009                oldpage = newpage;
1010        } else {
1011                mem_cgroup_migrate(oldpage, newpage, true);
1012                lru_cache_add_anon(newpage);
1013                *pagep = newpage;
1014        }
1015
1016        ClearPageSwapCache(oldpage);
1017        set_page_private(oldpage, 0);
1018
1019        unlock_page(oldpage);
1020        page_cache_release(oldpage);
1021        page_cache_release(oldpage);
1022        return error;
1023}
1024
1025/*
1026 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1027 *
1028 * If we allocate a new one we do not mark it dirty. That's up to the
1029 * vm. If we swap it in we mark it dirty since we also free the swap
1030 * entry since a page cannot live in both the swap and page cache
1031 */
1032static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1033        struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1034{
1035        struct address_space *mapping = inode->i_mapping;
1036        struct shmem_inode_info *info;
1037        struct shmem_sb_info *sbinfo;
1038        struct mem_cgroup *memcg;
1039        struct page *page;
1040        swp_entry_t swap;
1041        int error;
1042        int once = 0;
1043        int alloced = 0;
1044
1045        if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1046                return -EFBIG;
1047repeat:
1048        swap.val = 0;
1049        page = find_lock_entry(mapping, index);
1050        if (radix_tree_exceptional_entry(page)) {
1051                swap = radix_to_swp_entry(page);
1052                page = NULL;
1053        }
1054
1055        if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1056            ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1057                error = -EINVAL;
1058                goto failed;
1059        }
1060
1061        if (page && sgp == SGP_WRITE)
1062                mark_page_accessed(page);
1063
1064        /* fallocated page? */
1065        if (page && !PageUptodate(page)) {
1066                if (sgp != SGP_READ)
1067                        goto clear;
1068                unlock_page(page);
1069                page_cache_release(page);
1070                page = NULL;
1071        }
1072        if (page || (sgp == SGP_READ && !swap.val)) {
1073                *pagep = page;
1074                return 0;
1075        }
1076
1077        /*
1078         * Fast cache lookup did not find it:
1079         * bring it back from swap or allocate.
1080         */
1081        info = SHMEM_I(inode);
1082        sbinfo = SHMEM_SB(inode->i_sb);
1083
1084        if (swap.val) {
1085                /* Look it up and read it in.. */
1086                page = lookup_swap_cache(swap);
1087                if (!page) {
1088                        /* here we actually do the io */
1089                        if (fault_type)
1090                                *fault_type |= VM_FAULT_MAJOR;
1091                        page = shmem_swapin(swap, gfp, info, index);
1092                        if (!page) {
1093                                error = -ENOMEM;
1094                                goto failed;
1095                        }
1096                }
1097
1098                /* We have to do this with page locked to prevent races */
1099                lock_page(page);
1100                if (!PageSwapCache(page) || page_private(page) != swap.val ||
1101                    !shmem_confirm_swap(mapping, index, swap)) {
1102                        error = -EEXIST;        /* try again */
1103                        goto unlock;
1104                }
1105                if (!PageUptodate(page)) {
1106                        error = -EIO;
1107                        goto failed;
1108                }
1109                wait_on_page_writeback(page);
1110
1111                if (shmem_should_replace_page(page, gfp)) {
1112                        error = shmem_replace_page(&page, gfp, info, index);
1113                        if (error)
1114                                goto failed;
1115                }
1116
1117                error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
1118                if (!error) {
1119                        error = shmem_add_to_page_cache(page, mapping, index,
1120                                                swp_to_radix_entry(swap));
1121                        /*
1122                         * We already confirmed swap under page lock, and make
1123                         * no memory allocation here, so usually no possibility
1124                         * of error; but free_swap_and_cache() only trylocks a
1125                         * page, so it is just possible that the entry has been
1126                         * truncated or holepunched since swap was confirmed.
1127                         * shmem_undo_range() will have done some of the
1128                         * unaccounting, now delete_from_swap_cache() will do
1129                         * the rest.
1130                         * Reset swap.val? No, leave it so "failed" goes back to
1131                         * "repeat": reading a hole and writing should succeed.
1132                         */
1133                        if (error) {
1134                                mem_cgroup_cancel_charge(page, memcg);
1135                                delete_from_swap_cache(page);
1136                        }
1137                }
1138                if (error)
1139                        goto failed;
1140
1141                mem_cgroup_commit_charge(page, memcg, true);
1142
1143                spin_lock(&info->lock);
1144                info->swapped--;
1145                shmem_recalc_inode(inode);
1146                spin_unlock(&info->lock);
1147
1148                if (sgp == SGP_WRITE)
1149                        mark_page_accessed(page);
1150
1151                delete_from_swap_cache(page);
1152                set_page_dirty(page);
1153                swap_free(swap);
1154
1155        } else {
1156                if (shmem_acct_block(info->flags)) {
1157                        error = -ENOSPC;
1158                        goto failed;
1159                }
1160                if (sbinfo->max_blocks) {
1161                        if (percpu_counter_compare(&sbinfo->used_blocks,
1162                                                sbinfo->max_blocks) >= 0) {
1163                                error = -ENOSPC;
1164                                goto unacct;
1165                        }
1166                        percpu_counter_inc(&sbinfo->used_blocks);
1167                }
1168
1169                page = shmem_alloc_page(gfp, info, index);
1170                if (!page) {
1171                        error = -ENOMEM;
1172                        goto decused;
1173                }
1174
1175                __SetPageSwapBacked(page);
1176                __set_page_locked(page);
1177                if (sgp == SGP_WRITE)
1178                        __SetPageReferenced(page);
1179
1180                error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
1181                if (error)
1182                        goto decused;
1183                error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1184                if (!error) {
1185                        error = shmem_add_to_page_cache(page, mapping, index,
1186                                                        NULL);
1187                        radix_tree_preload_end();
1188                }
1189                if (error) {
1190                        mem_cgroup_cancel_charge(page, memcg);
1191                        goto decused;
1192                }
1193                mem_cgroup_commit_charge(page, memcg, false);
1194                lru_cache_add_anon(page);
1195
1196                spin_lock(&info->lock);
1197                info->alloced++;
1198                inode->i_blocks += BLOCKS_PER_PAGE;
1199                shmem_recalc_inode(inode);
1200                spin_unlock(&info->lock);
1201                alloced = true;
1202
1203                /*
1204                 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1205                 */
1206                if (sgp == SGP_FALLOC)
1207                        sgp = SGP_WRITE;
1208clear:
1209                /*
1210                 * Let SGP_WRITE caller clear ends if write does not fill page;
1211                 * but SGP_FALLOC on a page fallocated earlier must initialize
1212                 * it now, lest undo on failure cancel our earlier guarantee.
1213                 */
1214                if (sgp != SGP_WRITE) {
1215                        clear_highpage(page);
1216                        flush_dcache_page(page);
1217                        SetPageUptodate(page);
1218                }
1219                if (sgp == SGP_DIRTY)
1220                        set_page_dirty(page);
1221        }
1222
1223        /* Perhaps the file has been truncated since we checked */
1224        if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1225            ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1226                error = -EINVAL;
1227                if (alloced)
1228                        goto trunc;
1229                else
1230                        goto failed;
1231        }
1232        *pagep = page;
1233        return 0;
1234
1235        /*
1236         * Error recovery.
1237         */
1238trunc:
1239        info = SHMEM_I(inode);
1240        ClearPageDirty(page);
1241        delete_from_page_cache(page);
1242        spin_lock(&info->lock);
1243        info->alloced--;
1244        inode->i_blocks -= BLOCKS_PER_PAGE;
1245        spin_unlock(&info->lock);
1246decused:
1247        sbinfo = SHMEM_SB(inode->i_sb);
1248        if (sbinfo->max_blocks)
1249                percpu_counter_add(&sbinfo->used_blocks, -1);
1250unacct:
1251        shmem_unacct_blocks(info->flags, 1);
1252failed:
1253        if (swap.val && error != -EINVAL &&
1254            !shmem_confirm_swap(mapping, index, swap))
1255                error = -EEXIST;
1256unlock:
1257        if (page) {
1258                unlock_page(page);
1259                page_cache_release(page);
1260        }
1261        if (error == -ENOSPC && !once++) {
1262                info = SHMEM_I(inode);
1263                spin_lock(&info->lock);
1264                shmem_recalc_inode(inode);
1265                spin_unlock(&info->lock);
1266                goto repeat;
1267        }
1268        if (error == -EEXIST)   /* from above or from radix_tree_insert */
1269                goto repeat;
1270        return error;
1271}
1272
1273static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1274{
1275        struct inode *inode = file_inode(vma->vm_file);
1276        int error;
1277        int ret = VM_FAULT_LOCKED;
1278
1279        /*
1280         * Trinity finds that probing a hole which tmpfs is punching can
1281         * prevent the hole-punch from ever completing: which in turn
1282         * locks writers out with its hold on i_mutex.  So refrain from
1283         * faulting pages into the hole while it's being punched.  Although
1284         * shmem_undo_range() does remove the additions, it may be unable to
1285         * keep up, as each new page needs its own unmap_mapping_range() call,
1286         * and the i_mmap tree grows ever slower to scan if new vmas are added.
1287         *
1288         * It does not matter if we sometimes reach this check just before the
1289         * hole-punch begins, so that one fault then races with the punch:
1290         * we just need to make racing faults a rare case.
1291         *
1292         * The implementation below would be much simpler if we just used a
1293         * standard mutex or completion: but we cannot take i_mutex in fault,
1294         * and bloating every shmem inode for this unlikely case would be sad.
1295         */
1296        if (unlikely(inode->i_private)) {
1297                struct shmem_falloc *shmem_falloc;
1298
1299                spin_lock(&inode->i_lock);
1300                shmem_falloc = inode->i_private;
1301                if (shmem_falloc &&
1302                    shmem_falloc->waitq &&
1303                    vmf->pgoff >= shmem_falloc->start &&
1304                    vmf->pgoff < shmem_falloc->next) {
1305                        wait_queue_head_t *shmem_falloc_waitq;
1306                        DEFINE_WAIT(shmem_fault_wait);
1307
1308                        ret = VM_FAULT_NOPAGE;
1309                        if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
1310                           !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
1311                                /* It's polite to up mmap_sem if we can */
1312                                up_read(&vma->vm_mm->mmap_sem);
1313                                ret = VM_FAULT_RETRY;
1314                        }
1315
1316                        shmem_falloc_waitq = shmem_falloc->waitq;
1317                        prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
1318                                        TASK_UNINTERRUPTIBLE);
1319                        spin_unlock(&inode->i_lock);
1320                        schedule();
1321
1322                        /*
1323                         * shmem_falloc_waitq points into the shmem_fallocate()
1324                         * stack of the hole-punching task: shmem_falloc_waitq
1325                         * is usually invalid by the time we reach here, but
1326                         * finish_wait() does not dereference it in that case;
1327                         * though i_lock needed lest racing with wake_up_all().
1328                         */
1329                        spin_lock(&inode->i_lock);
1330                        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
1331                        spin_unlock(&inode->i_lock);
1332                        return ret;
1333                }
1334                spin_unlock(&inode->i_lock);
1335        }
1336
1337        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1338        if (error)
1339                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1340
1341        if (ret & VM_FAULT_MAJOR) {
1342                count_vm_event(PGMAJFAULT);
1343                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1344        }
1345        return ret;
1346}
1347
1348#ifdef CONFIG_NUMA
1349static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1350{
1351        struct inode *inode = file_inode(vma->vm_file);
1352        return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1353}
1354
1355static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1356                                          unsigned long addr)
1357{
1358        struct inode *inode = file_inode(vma->vm_file);
1359        pgoff_t index;
1360
1361        index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1362        return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1363}
1364#endif
1365
1366int shmem_lock(struct file *file, int lock, struct user_struct *user)
1367{
1368        struct inode *inode = file_inode(file);
1369        struct shmem_inode_info *info = SHMEM_I(inode);
1370        int retval = -ENOMEM;
1371
1372        spin_lock(&info->lock);
1373        if (lock && !(info->flags & VM_LOCKED)) {
1374                if (!user_shm_lock(inode->i_size, user))
1375                        goto out_nomem;
1376                info->flags |= VM_LOCKED;
1377                mapping_set_unevictable(file->f_mapping);
1378        }
1379        if (!lock && (info->flags & VM_LOCKED) && user) {
1380                user_shm_unlock(inode->i_size, user);
1381                info->flags &= ~VM_LOCKED;
1382                mapping_clear_unevictable(file->f_mapping);
1383        }
1384        retval = 0;
1385
1386out_nomem:
1387        spin_unlock(&info->lock);
1388        return retval;
1389}
1390
1391static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1392{
1393        file_accessed(file);
1394        vma->vm_ops = &shmem_vm_ops;
1395        return 0;
1396}
1397
1398static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1399                                     umode_t mode, dev_t dev, unsigned long flags)
1400{
1401        struct inode *inode;
1402        struct shmem_inode_info *info;
1403        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1404
1405        if (shmem_reserve_inode(sb))
1406                return NULL;
1407
1408        inode = new_inode(sb);
1409        if (inode) {
1410                inode->i_ino = get_next_ino();
1411                inode_init_owner(inode, dir, mode);
1412                inode->i_blocks = 0;
1413                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1414                inode->i_generation = get_seconds();
1415                info = SHMEM_I(inode);
1416                memset(info, 0, (char *)inode - (char *)info);
1417                spin_lock_init(&info->lock);
1418                info->seals = F_SEAL_SEAL;
1419                info->flags = flags & VM_NORESERVE;
1420                INIT_LIST_HEAD(&info->swaplist);
1421                simple_xattrs_init(&info->xattrs);
1422                cache_no_acl(inode);
1423
1424                switch (mode & S_IFMT) {
1425                default:
1426                        inode->i_op = &shmem_special_inode_operations;
1427                        init_special_inode(inode, mode, dev);
1428                        break;
1429                case S_IFREG:
1430                        inode->i_mapping->a_ops = &shmem_aops;
1431                        inode->i_op = &shmem_inode_operations;
1432                        inode->i_fop = &shmem_file_operations;
1433                        mpol_shared_policy_init(&info->policy,
1434                                                 shmem_get_sbmpol(sbinfo));
1435                        break;
1436                case S_IFDIR:
1437                        inc_nlink(inode);
1438                        /* Some things misbehave if size == 0 on a directory */
1439                        inode->i_size = 2 * BOGO_DIRENT_SIZE;
1440                        inode->i_op = &shmem_dir_inode_operations;
1441                        inode->i_fop = &simple_dir_operations;
1442                        break;
1443                case S_IFLNK:
1444                        /*
1445                         * Must not load anything in the rbtree,
1446                         * mpol_free_shared_policy will not be called.
1447                         */
1448                        mpol_shared_policy_init(&info->policy, NULL);
1449                        break;
1450                }
1451        } else
1452                shmem_free_inode(sb);
1453        return inode;
1454}
1455
1456bool shmem_mapping(struct address_space *mapping)
1457{
1458        if (!mapping->host)
1459                return false;
1460
1461        return mapping->host->i_sb->s_op == &shmem_ops;
1462}
1463
1464#ifdef CONFIG_TMPFS
1465static const struct inode_operations shmem_symlink_inode_operations;
1466static const struct inode_operations shmem_short_symlink_operations;
1467
1468#ifdef CONFIG_TMPFS_XATTR
1469static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1470#else
1471#define shmem_initxattrs NULL
1472#endif
1473
1474static int
1475shmem_write_begin(struct file *file, struct address_space *mapping,
1476                        loff_t pos, unsigned len, unsigned flags,
1477                        struct page **pagep, void **fsdata)
1478{
1479        struct inode *inode = mapping->host;
1480        struct shmem_inode_info *info = SHMEM_I(inode);
1481        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1482
1483        /* i_mutex is held by caller */
1484        if (unlikely(info->seals)) {
1485                if (info->seals & F_SEAL_WRITE)
1486                        return -EPERM;
1487                if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
1488                        return -EPERM;
1489        }
1490
1491        return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1492}
1493
1494static int
1495shmem_write_end(struct file *file, struct address_space *mapping,
1496                        loff_t pos, unsigned len, unsigned copied,
1497                        struct page *page, void *fsdata)
1498{
1499        struct inode *inode = mapping->host;
1500
1501        if (pos + copied > inode->i_size)
1502                i_size_write(inode, pos + copied);
1503
1504        if (!PageUptodate(page)) {
1505                if (copied < PAGE_CACHE_SIZE) {
1506                        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1507                        zero_user_segments(page, 0, from,
1508                                        from + copied, PAGE_CACHE_SIZE);
1509                }
1510                SetPageUptodate(page);
1511        }
1512        set_page_dirty(page);
1513        unlock_page(page);
1514        page_cache_release(page);
1515
1516        return copied;
1517}
1518
1519static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1520{
1521        struct file *file = iocb->ki_filp;
1522        struct inode *inode = file_inode(file);
1523        struct address_space *mapping = inode->i_mapping;
1524        pgoff_t index;
1525        unsigned long offset;
1526        enum sgp_type sgp = SGP_READ;
1527        int error = 0;
1528        ssize_t retval = 0;
1529        loff_t *ppos = &iocb->ki_pos;
1530
1531        /*
1532         * Might this read be for a stacking filesystem?  Then when reading
1533         * holes of a sparse file, we actually need to allocate those pages,
1534         * and even mark them dirty, so it cannot exceed the max_blocks limit.
1535         */
1536        if (!iter_is_iovec(to))
1537                sgp = SGP_DIRTY;
1538
1539        index = *ppos >> PAGE_CACHE_SHIFT;
1540        offset = *ppos & ~PAGE_CACHE_MASK;
1541
1542        for (;;) {
1543                struct page *page = NULL;
1544                pgoff_t end_index;
1545                unsigned long nr, ret;
1546                loff_t i_size = i_size_read(inode);
1547
1548                end_index = i_size >> PAGE_CACHE_SHIFT;
1549                if (index > end_index)
1550                        break;
1551                if (index == end_index) {
1552                        nr = i_size & ~PAGE_CACHE_MASK;
1553                        if (nr <= offset)
1554                                break;
1555                }
1556
1557                error = shmem_getpage(inode, index, &page, sgp, NULL);
1558                if (error) {
1559                        if (error == -EINVAL)
1560                                error = 0;
1561                        break;
1562                }
1563                if (page)
1564                        unlock_page(page);
1565
1566                /*
1567                 * We must evaluate after, since reads (unlike writes)
1568                 * are called without i_mutex protection against truncate
1569                 */
1570                nr = PAGE_CACHE_SIZE;
1571                i_size = i_size_read(inode);
1572                end_index = i_size >> PAGE_CACHE_SHIFT;
1573                if (index == end_index) {
1574                        nr = i_size & ~PAGE_CACHE_MASK;
1575                        if (nr <= offset) {
1576                                if (page)
1577                                        page_cache_release(page);
1578                                break;
1579                        }
1580                }
1581                nr -= offset;
1582
1583                if (page) {
1584                        /*
1585                         * If users can be writing to this page using arbitrary
1586                         * virtual addresses, take care about potential aliasing
1587                         * before reading the page on the kernel side.
1588                         */
1589                        if (mapping_writably_mapped(mapping))
1590                                flush_dcache_page(page);
1591                        /*
1592                         * Mark the page accessed if we read the beginning.
1593                         */
1594                        if (!offset)
1595                                mark_page_accessed(page);
1596                } else {
1597                        page = ZERO_PAGE(0);
1598                        page_cache_get(page);
1599                }
1600
1601                /*
1602                 * Ok, we have the page, and it's up-to-date, so
1603                 * now we can copy it to user space...
1604                 */
1605                ret = copy_page_to_iter(page, offset, nr, to);
1606                retval += ret;
1607                offset += ret;
1608                index += offset >> PAGE_CACHE_SHIFT;
1609                offset &= ~PAGE_CACHE_MASK;
1610
1611                page_cache_release(page);
1612                if (!iov_iter_count(to))
1613                        break;
1614                if (ret < nr) {
1615                        error = -EFAULT;
1616                        break;
1617                }
1618                cond_resched();
1619        }
1620
1621        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1622        file_accessed(file);
1623        return retval ? retval : error;
1624}
1625
1626static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1627                                struct pipe_inode_info *pipe, size_t len,
1628                                unsigned int flags)
1629{
1630        struct address_space *mapping = in->f_mapping;
1631        struct inode *inode = mapping->host;
1632        unsigned int loff, nr_pages, req_pages;
1633        struct page *pages[PIPE_DEF_BUFFERS];
1634        struct partial_page partial[PIPE_DEF_BUFFERS];
1635        struct page *page;
1636        pgoff_t index, end_index;
1637        loff_t isize, left;
1638        int error, page_nr;
1639        struct splice_pipe_desc spd = {
1640                .pages = pages,
1641                .partial = partial,
1642                .nr_pages_max = PIPE_DEF_BUFFERS,
1643                .flags = flags,
1644                .ops = &page_cache_pipe_buf_ops,
1645                .spd_release = spd_release_page,
1646        };
1647
1648        isize = i_size_read(inode);
1649        if (unlikely(*ppos >= isize))
1650                return 0;
1651
1652        left = isize - *ppos;
1653        if (unlikely(left < len))
1654                len = left;
1655
1656        if (splice_grow_spd(pipe, &spd))
1657                return -ENOMEM;
1658
1659        index = *ppos >> PAGE_CACHE_SHIFT;
1660        loff = *ppos & ~PAGE_CACHE_MASK;
1661        req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1662        nr_pages = min(req_pages, spd.nr_pages_max);
1663
1664        spd.nr_pages = find_get_pages_contig(mapping, index,
1665                                                nr_pages, spd.pages);
1666        index += spd.nr_pages;
1667        error = 0;
1668
1669        while (spd.nr_pages < nr_pages) {
1670                error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1671                if (error)
1672                        break;
1673                unlock_page(page);
1674                spd.pages[spd.nr_pages++] = page;
1675                index++;
1676        }
1677
1678        index = *ppos >> PAGE_CACHE_SHIFT;
1679        nr_pages = spd.nr_pages;
1680        spd.nr_pages = 0;
1681
1682        for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1683                unsigned int this_len;
1684
1685                if (!len)
1686                        break;
1687
1688                this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1689                page = spd.pages[page_nr];
1690
1691                if (!PageUptodate(page) || page->mapping != mapping) {
1692                        error = shmem_getpage(inode, index, &page,
1693                                                        SGP_CACHE, NULL);
1694                        if (error)
1695                                break;
1696                        unlock_page(page);
1697                        page_cache_release(spd.pages[page_nr]);
1698                        spd.pages[page_nr] = page;
1699                }
1700
1701                isize = i_size_read(inode);
1702                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1703                if (unlikely(!isize || index > end_index))
1704                        break;
1705
1706                if (end_index == index) {
1707                        unsigned int plen;
1708
1709                        plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1710                        if (plen <= loff)
1711                                break;
1712
1713                        this_len = min(this_len, plen - loff);
1714                        len = this_len;
1715                }
1716
1717                spd.partial[page_nr].offset = loff;
1718                spd.partial[page_nr].len = this_len;
1719                len -= this_len;
1720                loff = 0;
1721                spd.nr_pages++;
1722                index++;
1723        }
1724
1725        while (page_nr < nr_pages)
1726                page_cache_release(spd.pages[page_nr++]);
1727
1728        if (spd.nr_pages)
1729                error = splice_to_pipe(pipe, &spd);
1730
1731        splice_shrink_spd(&spd);
1732
1733        if (error > 0) {
1734                *ppos += error;
1735                file_accessed(in);
1736        }
1737        return error;
1738}
1739
1740/*
1741 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1742 */
1743static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1744                                    pgoff_t index, pgoff_t end, int whence)
1745{
1746        struct page *page;
1747        struct pagevec pvec;
1748        pgoff_t indices[PAGEVEC_SIZE];
1749        bool done = false;
1750        int i;
1751
1752        pagevec_init(&pvec, 0);
1753        pvec.nr = 1;            /* start small: we may be there already */
1754        while (!done) {
1755                pvec.nr = find_get_entries(mapping, index,
1756                                        pvec.nr, pvec.pages, indices);
1757                if (!pvec.nr) {
1758                        if (whence == SEEK_DATA)
1759                                index = end;
1760                        break;
1761                }
1762                for (i = 0; i < pvec.nr; i++, index++) {
1763                        if (index < indices[i]) {
1764                                if (whence == SEEK_HOLE) {
1765                                        done = true;
1766                                        break;
1767                                }
1768                                index = indices[i];
1769                        }
1770                        page = pvec.pages[i];
1771                        if (page && !radix_tree_exceptional_entry(page)) {
1772                                if (!PageUptodate(page))
1773                                        page = NULL;
1774                        }
1775                        if (index >= end ||
1776                            (page && whence == SEEK_DATA) ||
1777                            (!page && whence == SEEK_HOLE)) {
1778                                done = true;
1779                                break;
1780                        }
1781                }
1782                pagevec_remove_exceptionals(&pvec);
1783                pagevec_release(&pvec);
1784                pvec.nr = PAGEVEC_SIZE;
1785                cond_resched();
1786        }
1787        return index;
1788}
1789
1790static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1791{
1792        struct address_space *mapping = file->f_mapping;
1793        struct inode *inode = mapping->host;
1794        pgoff_t start, end;
1795        loff_t new_offset;
1796
1797        if (whence != SEEK_DATA && whence != SEEK_HOLE)
1798                return generic_file_llseek_size(file, offset, whence,
1799                                        MAX_LFS_FILESIZE, i_size_read(inode));
1800        mutex_lock(&inode->i_mutex);
1801        /* We're holding i_mutex so we can access i_size directly */
1802
1803        if (offset < 0)
1804                offset = -EINVAL;
1805        else if (offset >= inode->i_size)
1806                offset = -ENXIO;
1807        else {
1808                start = offset >> PAGE_CACHE_SHIFT;
1809                end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1810                new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1811                new_offset <<= PAGE_CACHE_SHIFT;
1812                if (new_offset > offset) {
1813                        if (new_offset < inode->i_size)
1814                                offset = new_offset;
1815                        else if (whence == SEEK_DATA)
1816                                offset = -ENXIO;
1817                        else
1818                                offset = inode->i_size;
1819                }
1820        }
1821
1822        if (offset >= 0)
1823                offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
1824        mutex_unlock(&inode->i_mutex);
1825        return offset;
1826}
1827
1828/*
1829 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
1830 * so reuse a tag which we firmly believe is never set or cleared on shmem.
1831 */
1832#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
1833#define LAST_SCAN               4       /* about 150ms max */
1834
1835static void shmem_tag_pins(struct address_space *mapping)
1836{
1837        struct radix_tree_iter iter;
1838        void **slot;
1839        pgoff_t start;
1840        struct page *page;
1841
1842        lru_add_drain();
1843        start = 0;
1844        rcu_read_lock();
1845
1846restart:
1847        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1848                page = radix_tree_deref_slot(slot);
1849                if (!page || radix_tree_exception(page)) {
1850                        if (radix_tree_deref_retry(page))
1851                                goto restart;
1852                } else if (page_count(page) - page_mapcount(page) > 1) {
1853                        spin_lock_irq(&mapping->tree_lock);
1854                        radix_tree_tag_set(&mapping->page_tree, iter.index,
1855                                           SHMEM_TAG_PINNED);
1856                        spin_unlock_irq(&mapping->tree_lock);
1857                }
1858
1859                if (need_resched()) {
1860                        cond_resched_rcu();
1861                        start = iter.index + 1;
1862                        goto restart;
1863                }
1864        }
1865        rcu_read_unlock();
1866}
1867
1868/*
1869 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
1870 * via get_user_pages(), drivers might have some pending I/O without any active
1871 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
1872 * and see whether it has an elevated ref-count. If so, we tag them and wait for
1873 * them to be dropped.
1874 * The caller must guarantee that no new user will acquire writable references
1875 * to those pages to avoid races.
1876 */
1877static int shmem_wait_for_pins(struct address_space *mapping)
1878{
1879        struct radix_tree_iter iter;
1880        void **slot;
1881        pgoff_t start;
1882        struct page *page;
1883        int error, scan;
1884
1885        shmem_tag_pins(mapping);
1886
1887        error = 0;
1888        for (scan = 0; scan <= LAST_SCAN; scan++) {
1889                if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
1890                        break;
1891
1892                if (!scan)
1893                        lru_add_drain_all();
1894                else if (schedule_timeout_killable((HZ << scan) / 200))
1895                        scan = LAST_SCAN;
1896
1897                start = 0;
1898                rcu_read_lock();
1899restart:
1900                radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
1901                                           start, SHMEM_TAG_PINNED) {
1902
1903                        page = radix_tree_deref_slot(slot);
1904                        if (radix_tree_exception(page)) {
1905                                if (radix_tree_deref_retry(page))
1906                                        goto restart;
1907
1908                                page = NULL;
1909                        }
1910
1911                        if (page &&
1912                            page_count(page) - page_mapcount(page) != 1) {
1913                                if (scan < LAST_SCAN)
1914                                        goto continue_resched;
1915
1916                                /*
1917                                 * On the last scan, we clean up all those tags
1918                                 * we inserted; but make a note that we still
1919                                 * found pages pinned.
1920                                 */
1921                                error = -EBUSY;
1922                        }
1923
1924                        spin_lock_irq(&mapping->tree_lock);
1925                        radix_tree_tag_clear(&mapping->page_tree,
1926                                             iter.index, SHMEM_TAG_PINNED);
1927                        spin_unlock_irq(&mapping->tree_lock);
1928continue_resched:
1929                        if (need_resched()) {
1930                                cond_resched_rcu();
1931                                start = iter.index + 1;
1932                                goto restart;
1933                        }
1934                }
1935                rcu_read_unlock();
1936        }
1937
1938        return error;
1939}
1940
1941#define F_ALL_SEALS (F_SEAL_SEAL | \
1942                     F_SEAL_SHRINK | \
1943                     F_SEAL_GROW | \
1944                     F_SEAL_WRITE)
1945
1946int shmem_add_seals(struct file *file, unsigned int seals)
1947{
1948        struct inode *inode = file_inode(file);
1949        struct shmem_inode_info *info = SHMEM_I(inode);
1950        int error;
1951
1952        /*
1953         * SEALING
1954         * Sealing allows multiple parties to share a shmem-file but restrict
1955         * access to a specific subset of file operations. Seals can only be
1956         * added, but never removed. This way, mutually untrusted parties can
1957         * share common memory regions with a well-defined policy. A malicious
1958         * peer can thus never perform unwanted operations on a shared object.
1959         *
1960         * Seals are only supported on special shmem-files and always affect
1961         * the whole underlying inode. Once a seal is set, it may prevent some
1962         * kinds of access to the file. Currently, the following seals are
1963         * defined:
1964         *   SEAL_SEAL: Prevent further seals from being set on this file
1965         *   SEAL_SHRINK: Prevent the file from shrinking
1966         *   SEAL_GROW: Prevent the file from growing
1967         *   SEAL_WRITE: Prevent write access to the file
1968         *
1969         * As we don't require any trust relationship between two parties, we
1970         * must prevent seals from being removed. Therefore, sealing a file
1971         * only adds a given set of seals to the file, it never touches
1972         * existing seals. Furthermore, the "setting seals"-operation can be
1973         * sealed itself, which basically prevents any further seal from being
1974         * added.
1975         *
1976         * Semantics of sealing are only defined on volatile files. Only
1977         * anonymous shmem files support sealing. More importantly, seals are
1978         * never written to disk. Therefore, there's no plan to support it on
1979         * other file types.
1980         */
1981
1982        if (file->f_op != &shmem_file_operations)
1983                return -EINVAL;
1984        if (!(file->f_mode & FMODE_WRITE))
1985                return -EPERM;
1986        if (seals & ~(unsigned int)F_ALL_SEALS)
1987                return -EINVAL;
1988
1989        mutex_lock(&inode->i_mutex);
1990
1991        if (info->seals & F_SEAL_SEAL) {
1992                error = -EPERM;
1993                goto unlock;
1994        }
1995
1996        if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
1997                error = mapping_deny_writable(file->f_mapping);
1998                if (error)
1999                        goto unlock;
2000

2001                error = shmem_wait_for_pins(file->f_mapping);
2002                if (error) {
2003                        mapping_allow_writable(file->f_mapping);
2004                        goto unlock;
2005                }
2006        }
2007
2008        info->seals |= seals;
2009        error = 0;
2010
2011unlock:
2012        mutex_unlock(&inode->i_mutex);
2013        return error;
2014}
2015EXPORT_SYMBOL_GPL(shmem_add_seals);
2016
2017int shmem_get_seals(struct file *file)
2018{
2019        if (file->f_op != &shmem_file_operations)
2020                return -EINVAL;
2021
2022        return SHMEM_I(file_inode(file))->seals;
2023}
2024EXPORT_SYMBOL_GPL(shmem_get_seals);
2025
2026long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
2027{
2028        long error;
2029
2030        switch (cmd) {
2031        case F_ADD_SEALS:
2032                /* disallow upper 32bit */
2033                if (arg > UINT_MAX)
2034                        return -EINVAL;
2035
2036                error = shmem_add_seals(file, arg);
2037                break;
2038        case F_GET_SEALS:
2039                error = shmem_get_seals(file);
2040                break;
2041        default:
2042                error = -EINVAL;
2043                break;
2044        }
2045
2046        return error;
2047}
2048
2049static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2050                                                         loff_t len)
2051{
2052        struct inode *inode = file_inode(file);
2053        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2054        struct shmem_inode_info *info = SHMEM_I(inode);
2055        struct shmem_falloc shmem_falloc;
2056        pgoff_t start, index, end;
2057        int error;
2058
2059        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2060                return -EOPNOTSUPP;
2061
2062        mutex_lock(&inode->i_mutex);
2063
2064        if (mode & FALLOC_FL_PUNCH_HOLE) {
2065                struct address_space *mapping = file->f_mapping;
2066                loff_t unmap_start = round_up(offset, PAGE_SIZE);
2067                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2068                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2069
2070                /* protected by i_mutex */
2071                if (info->seals & F_SEAL_WRITE) {
2072                        error = -EPERM;
2073                        goto out;
2074                }
2075
2076                shmem_falloc.waitq = &shmem_falloc_waitq;
2077                shmem_falloc.start = unmap_start >> PAGE_SHIFT;
2078                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2079                spin_lock(&inode->i_lock);
2080                inode->i_private = &shmem_falloc;
2081                spin_unlock(&inode->i_lock);
2082
2083                if ((u64)unmap_end > (u64)unmap_start)
2084                        unmap_mapping_range(mapping, unmap_start,
2085                                            1 + unmap_end - unmap_start, 0);
2086                shmem_truncate_range(inode, offset, offset + len - 1);
2087                /* No need to unmap again: hole-punching leaves COWed pages */
2088
2089                spin_lock(&inode->i_lock);
2090                inode->i_private = NULL;
2091                wake_up_all(&shmem_falloc_waitq);
2092                spin_unlock(&inode->i_lock);
2093                error = 0;
2094                goto out;
2095        }
2096
2097        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2098        error = inode_newsize_ok(inode, offset + len);
2099        if (error)
2100                goto out;
2101
2102        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2103                error = -EPERM;
2104                goto out;
2105        }
2106
2107        start = offset >> PAGE_CACHE_SHIFT;
2108        end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2109        /* Try to avoid a swapstorm if len is impossible to satisfy */
2110        if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2111                error = -ENOSPC;
2112                goto out;
2113        }
2114
2115        shmem_falloc.waitq = NULL;
2116        shmem_falloc.start = start;
2117        shmem_falloc.next  = start;
2118        shmem_falloc.nr_falloced = 0;
2119        shmem_falloc.nr_unswapped = 0;
2120        spin_lock(&inode->i_lock);
2121        inode->i_private = &shmem_falloc;
2122        spin_unlock(&inode->i_lock);
2123
2124        for (index = start; index < end; index++) {
2125                struct page *page;
2126
2127                /*
2128                 * Good, the fallocate(2) manpage permits EINTR: we may have
2129                 * been interrupted because we are using up too much memory.
2130                 */
2131                if (signal_pending(current))
2132                        error = -EINTR;
2133                else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2134                        error = -ENOMEM;
2135                else
2136                        error = shmem_getpage(inode, index, &page, SGP_FALLOC,
2137                                                                        NULL);
2138                if (error) {
2139                        /* Remove the !PageUptodate pages we added */
2140                        shmem_undo_range(inode,
2141                                (loff_t)start << PAGE_CACHE_SHIFT,
2142                                (loff_t)index << PAGE_CACHE_SHIFT, true);
2143                        goto undone;
2144                }
2145
2146                /*
2147                 * Inform shmem_writepage() how far we have reached.
2148                 * No need for lock or barrier: we have the page lock.
2149                 */
2150                shmem_falloc.next++;
2151                if (!PageUptodate(page))
2152                        shmem_falloc.nr_falloced++;
2153
2154                /*
2155                 * If !PageUptodate, leave it that way so that freeable pages
2156                 * can be recognized if we need to rollback on error later.
2157                 * But set_page_dirty so that memory pressure will swap rather
2158                 * than free the pages we are allocating (and SGP_CACHE pages
2159                 * might still be clean: we now need to mark those dirty too).
2160                 */
2161                set_page_dirty(page);
2162                unlock_page(page);
2163                page_cache_release(page);
2164                cond_resched();
2165        }
2166
2167        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2168                i_size_write(inode, offset + len);
2169        inode->i_ctime = CURRENT_TIME;
2170undone:
2171        spin_lock(&inode->i_lock);
2172        inode->i_private = NULL;
2173        spin_unlock(&inode->i_lock);
2174out:
2175        mutex_unlock(&inode->i_mutex);
2176        return error;
2177}
2178
2179static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2180{
2181        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2182
2183        buf->f_type = TMPFS_MAGIC;
2184        buf->f_bsize = PAGE_CACHE_SIZE;
2185        buf->f_namelen = NAME_MAX;
2186        if (sbinfo->max_blocks) {
2187                buf->f_blocks = sbinfo->max_blocks;
2188                buf->f_bavail =
2189                buf->f_bfree  = sbinfo->max_blocks -
2190                                percpu_counter_sum(&sbinfo->used_blocks);
2191        }
2192        if (sbinfo->max_inodes) {
2193                buf->f_files = sbinfo->max_inodes;
2194                buf->f_ffree = sbinfo->free_inodes;
2195        }
2196        /* else leave those fields 0 like simple_statfs */
2197        return 0;
2198}
2199
2200/*
2201 * File creation. Allocate an inode, and we're done..
2202 */
2203static int
2204shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
2205{
2206        struct inode *inode;
2207        int error = -ENOSPC;
2208
2209        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2210        if (inode) {
2211                error = simple_acl_create(dir, inode);
2212                if (error)
2213                        goto out_iput;
2214                error = security_inode_init_security(inode, dir,
2215                                                     &dentry->d_name,
2216                                                     shmem_initxattrs, NULL);
2217                if (error && error != -EOPNOTSUPP)
2218                        goto out_iput;
2219
2220                error = 0;
2221                dir->i_size += BOGO_DIRENT_SIZE;
2222                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2223                d_instantiate(dentry, inode);
2224                dget(dentry); /* Extra count - pin the dentry in core */
2225        }
2226        return error;
2227out_iput:
2228        iput(inode);
2229        return error;
2230}
2231
2232static int
2233shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2234{
2235        struct inode *inode;
2236        int error = -ENOSPC;
2237
2238        inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2239        if (inode) {
2240                error = security_inode_init_security(inode, dir,
2241                                                     NULL,
2242                                                     shmem_initxattrs, NULL);
2243                if (error && error != -EOPNOTSUPP)
2244                        goto out_iput;
2245                error = simple_acl_create(dir, inode);
2246                if (error)
2247                        goto out_iput;
2248                d_tmpfile(dentry, inode);
2249        }
2250        return error;
2251out_iput:
2252        iput(inode);
2253        return error;
2254}
2255
2256static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2257{
2258        int error;
2259
2260        if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
2261                return error;
2262        inc_nlink(dir);
2263        return 0;
2264}
2265
2266static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2267                bool excl)
2268{
2269        return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
2270}
2271
2272/*
2273 * Link a file..
2274 */
2275static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2276{
2277        struct inode *inode = d_inode(old_dentry);
2278        int ret;
2279
2280        /*
2281         * No ordinary (disk based) filesystem counts links as inodes;
2282         * but each new link needs a new dentry, pinning lowmem, and
2283         * tmpfs dentries cannot be pruned until they are unlinked.
2284         */
2285        ret = shmem_reserve_inode(inode->i_sb);
2286        if (ret)
2287                goto out;
2288
2289        dir->i_size += BOGO_DIRENT_SIZE;
2290        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2291        inc_nlink(inode);
2292        ihold(inode);   /* New dentry reference */
2293        dget(dentry);           /* Extra pinning count for the created dentry */
2294        d_instantiate(dentry, inode);
2295out:
2296        return ret;
2297}
2298
2299static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2300{
2301        struct inode *inode = d_inode(dentry);
2302
2303        if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2304                shmem_free_inode(inode->i_sb);
2305
2306        dir->i_size -= BOGO_DIRENT_SIZE;
2307        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2308        drop_nlink(inode);
2309        dput(dentry);   /* Undo the count from "create" - this does all the work */
2310        return 0;
2311}
2312
2313static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2314{
2315        if (!simple_empty(dentry))
2316                return -ENOTEMPTY;
2317
2318        drop_nlink(d_inode(dentry));
2319        drop_nlink(dir);
2320        return shmem_unlink(dir, dentry);
2321}
2322
2323static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2324{
2325        bool old_is_dir = d_is_dir(old_dentry);
2326        bool new_is_dir = d_is_dir(new_dentry);
2327
2328        if (old_dir != new_dir && old_is_dir != new_is_dir) {
2329                if (old_is_dir) {
2330                        drop_nlink(old_dir);
2331                        inc_nlink(new_dir);
2332                } else {
2333                        drop_nlink(new_dir);
2334                        inc_nlink(old_dir);
2335                }
2336        }
2337        old_dir->i_ctime = old_dir->i_mtime =
2338        new_dir->i_ctime = new_dir->i_mtime =
2339        d_inode(old_dentry)->i_ctime =
2340        d_inode(new_dentry)->i_ctime = CURRENT_TIME;
2341
2342        return 0;
2343}
2344
2345static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
2346{
2347        struct dentry *whiteout;
2348        int error;
2349
2350        whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
2351        if (!whiteout)
2352                return -ENOMEM;
2353
2354        error = shmem_mknod(old_dir, whiteout,
2355                            S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
2356        dput(whiteout);
2357        if (error)
2358                return error;
2359
2360        /*
2361         * Cheat and hash the whiteout while the old dentry is still in
2362         * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
2363         *
2364         * d_lookup() will consistently find one of them at this point,
2365         * not sure which one, but that isn't even important.
2366         */
2367        d_rehash(whiteout);
2368        return 0;
2369}
2370
2371/*
2372 * The VFS layer already does all the dentry stuff for rename,
2373 * we just have to decrement the usage count for the target if
2374 * it exists so that the VFS layer correctly free's it when it
2375 * gets overwritten.
2376 */
2377static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
2378{
2379        struct inode *inode = d_inode(old_dentry);
2380        int they_are_dirs = S_ISDIR(inode->i_mode);
2381
2382        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2383                return -EINVAL;
2384
2385        if (flags & RENAME_EXCHANGE)
2386                return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
2387
2388        if (!simple_empty(new_dentry))
2389                return -ENOTEMPTY;
2390
2391        if (flags & RENAME_WHITEOUT) {
2392                int error;
2393
2394                error = shmem_whiteout(old_dir, old_dentry);
2395                if (error)
2396                        return error;
2397        }
2398
2399        if (d_really_is_positive(new_dentry)) {
2400                (void) shmem_unlink(new_dir, new_dentry);
2401                if (they_are_dirs) {
2402                        drop_nlink(d_inode(new_dentry));
2403                        drop_nlink(old_dir);
2404                }
2405        } else if (they_are_dirs) {
2406                drop_nlink(old_dir);
2407                inc_nlink(new_dir);
2408        }
2409
2410        old_dir->i_size -= BOGO_DIRENT_SIZE;
2411        new_dir->i_size += BOGO_DIRENT_SIZE;
2412        old_dir->i_ctime = old_dir->i_mtime =
2413        new_dir->i_ctime = new_dir->i_mtime =
2414        inode->i_ctime = CURRENT_TIME;
2415        return 0;
2416}
2417
2418static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
2419{
2420        int error;
2421        int len;
2422        struct inode *inode;
2423        struct page *page;
2424        char *kaddr;
2425        struct shmem_inode_info *info;
2426
2427        len = strlen(symname) + 1;
2428        if (len > PAGE_CACHE_SIZE)
2429                return -ENAMETOOLONG;
2430
2431        inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
2432        if (!inode)
2433                return -ENOSPC;
2434
2435        error = security_inode_init_security(inode, dir, &dentry->d_name,
2436                                             shmem_initxattrs, NULL);
2437        if (error) {
2438                if (error != -EOPNOTSUPP) {
2439                        iput(inode);
2440                        return error;
2441                }
2442                error = 0;
2443        }
2444
2445        info = SHMEM_I(inode);
2446        inode->i_size = len-1;
2447        if (len <= SHORT_SYMLINK_LEN) {
2448                info->symlink = kmemdup(symname, len, GFP_KERNEL);
2449                if (!info->symlink) {
2450                        iput(inode);
2451                        return -ENOMEM;
2452                }
2453                inode->i_op = &shmem_short_symlink_operations;
2454        } else {
2455                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2456                if (error) {
2457                        iput(inode);
2458                        return error;
2459                }
2460                inode->i_mapping->a_ops = &shmem_aops;
2461                inode->i_op = &shmem_symlink_inode_operations;
2462                kaddr = kmap_atomic(page);
2463                memcpy(kaddr, symname, len);
2464                kunmap_atomic(kaddr);
2465                SetPageUptodate(page);
2466                set_page_dirty(page);
2467                unlock_page(page);
2468                page_cache_release(page);
2469        }
2470        dir->i_size += BOGO_DIRENT_SIZE;
2471        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2472        d_instantiate(dentry, inode);
2473        dget(dentry);
2474        return 0;
2475}
2476
2477static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2478{
2479        nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
2480        return NULL;
2481}
2482
2483static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2484{
2485        struct page *page = NULL;
2486        int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
2487        nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2488        if (page)
2489                unlock_page(page);
2490        return page;
2491}
2492
2493static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2494{
2495        if (!IS_ERR(nd_get_link(nd))) {
2496                struct page *page = cookie;
2497                kunmap(page);
2498                mark_page_accessed(page);
2499                page_cache_release(page);
2500        }
2501}
2502
2503#ifdef CONFIG_TMPFS_XATTR
2504/*
2505 * Superblocks without xattr inode operations may get some security.* xattr
2506 * support from the LSM "for free". As soon as we have any other xattrs
2507 * like ACLs, we also need to implement the security.* handlers at
2508 * filesystem level, though.
2509 */
2510
2511/*
2512 * Callback for security_inode_init_security() for acquiring xattrs.
2513 */
2514static int shmem_initxattrs(struct inode *inode,
2515                            const struct xattr *xattr_array,
2516                            void *fs_info)
2517{
2518        struct shmem_inode_info *info = SHMEM_I(inode);
2519        const struct xattr *xattr;
2520        struct simple_xattr *new_xattr;
2521        size_t len;
2522
2523        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2524                new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2525                if (!new_xattr)
2526                        return -ENOMEM;
2527
2528                len = strlen(xattr->name) + 1;
2529                new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
2530                                          GFP_KERNEL);
2531                if (!new_xattr->name) {
2532                        kfree(new_xattr);
2533                        return -ENOMEM;
2534                }
2535
2536                memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
2537                       XATTR_SECURITY_PREFIX_LEN);
2538                memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2539                       xattr->name, len);
2540
2541                simple_xattr_list_add(&info->xattrs, new_xattr);
2542        }
2543
2544        return 0;
2545}
2546
2547static const struct xattr_handler *shmem_xattr_handlers[] = {
2548#ifdef CONFIG_TMPFS_POSIX_ACL
2549        &posix_acl_access_xattr_handler,
2550        &posix_acl_default_xattr_handler,
2551#endif
2552        NULL
2553};
2554
2555static int shmem_xattr_validate(const char *name)
2556{
2557        struct { const char *prefix; size_t len; } arr[] = {
2558                { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2559                { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2560        };
2561        int i;
2562
2563        for (i = 0; i < ARRAY_SIZE(arr); i++) {
2564                size_t preflen = arr[i].len;
2565                if (strncmp(name, arr[i].prefix, preflen) == 0) {
2566                        if (!name[preflen])
2567                                return -EINVAL;
2568                        return 0;
2569                }
2570        }
2571        return -EOPNOTSUPP;
2572}
2573
2574static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2575                              void *buffer, size_t size)
2576{
2577        struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2578        int err;
2579
2580        /*
2581         * If this is a request for a synthetic attribute in the system.*
2582         * namespace use the generic infrastructure to resolve a handler
2583         * for it via sb->s_xattr.
2584         */
2585        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2586                return generic_getxattr(dentry, name, buffer, size);
2587
2588        err = shmem_xattr_validate(name);
2589        if (err)
2590                return err;
2591
2592        return simple_xattr_get(&info->xattrs, name, buffer, size);
2593}
2594
2595static int shmem_setxattr(struct dentry *dentry, const char *name,
2596                          const void *value, size_t size, int flags)
2597{
2598        struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2599        int err;
2600
2601        /*
2602         * If this is a request for a synthetic attribute in the system.*
2603         * namespace use the generic infrastructure to resolve a handler
2604         * for it via sb->s_xattr.
2605         */
2606        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2607                return generic_setxattr(dentry, name, value, size, flags);
2608
2609        err = shmem_xattr_validate(name);
2610        if (err)
2611                return err;
2612
2613        return simple_xattr_set(&info->xattrs, name, value, size, flags);
2614}
2615
2616static int shmem_removexattr(struct dentry *dentry, const char *name)
2617{
2618        struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2619        int err;
2620
2621        /*
2622         * If this is a request for a synthetic attribute in the system.*
2623         * namespace use the generic infrastructure to resolve a handler
2624         * for it via sb->s_xattr.
2625         */
2626        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2627                return generic_removexattr(dentry, name);
2628
2629        err = shmem_xattr_validate(name);
2630        if (err)
2631                return err;
2632
2633        return simple_xattr_remove(&info->xattrs, name);
2634}
2635
2636static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2637{
2638        struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2639        return simple_xattr_list(&info->xattrs, buffer, size);
2640}
2641#endif /* CONFIG_TMPFS_XATTR */
2642
2643static const struct inode_operations shmem_short_symlink_operations = {
2644        .readlink       = generic_readlink,
2645        .follow_link    = shmem_follow_short_symlink,
2646#ifdef CONFIG_TMPFS_XATTR
2647        .setxattr       = shmem_setxattr,
2648        .getxattr       = shmem_getxattr,
2649        .listxattr      = shmem_listxattr,
2650        .removexattr    = shmem_removexattr,
2651#endif
2652};
2653
2654static const struct inode_operations shmem_symlink_inode_operations = {
2655        .readlink       = generic_readlink,
2656        .follow_link    = shmem_follow_link,
2657        .put_link       = shmem_put_link,
2658#ifdef CONFIG_TMPFS_XATTR
2659        .setxattr       = shmem_setxattr,
2660        .getxattr       = shmem_getxattr,
2661        .listxattr      = shmem_listxattr,
2662        .removexattr    = shmem_removexattr,
2663#endif
2664};
2665
2666static struct dentry *shmem_get_parent(struct dentry *child)
2667{
2668        return ERR_PTR(-ESTALE);
2669}
2670
2671static int shmem_match(struct inode *ino, void *vfh)
2672{
2673        __u32 *fh = vfh;
2674        __u64 inum = fh[2];
2675        inum = (inum << 32) | fh[1];
2676        return ino->i_ino == inum && fh[0] == ino->i_generation;
2677}
2678
2679static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2680                struct fid *fid, int fh_len, int fh_type)
2681{
2682        struct inode *inode;
2683        struct dentry *dentry = NULL;
2684        u64 inum;
2685
2686        if (fh_len < 3)
2687                return NULL;
2688
2689        inum = fid->raw[2];
2690        inum = (inum << 32) | fid->raw[1];
2691
2692        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2693                        shmem_match, fid->raw);
2694        if (inode) {
2695                dentry = d_find_alias(inode);
2696                iput(inode);
2697        }
2698
2699        return dentry;
2700}
2701
2702static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2703                                struct inode *parent)
2704{
2705        if (*len < 3) {
2706                *len = 3;
2707                return FILEID_INVALID;
2708        }
2709
2710        if (inode_unhashed(inode)) {
2711                /* Unfortunately insert_inode_hash is not idempotent,
2712                 * so as we hash inodes here rather than at creation
2713                 * time, we need a lock to ensure we only try
2714                 * to do it once
2715                 */
2716                static DEFINE_SPINLOCK(lock);
2717                spin_lock(&lock);
2718                if (inode_unhashed(inode))
2719                        __insert_inode_hash(inode,
2720                                            inode->i_ino + inode->i_generation);
2721                spin_unlock(&lock);
2722        }
2723
2724        fh[0] = inode->i_generation;
2725        fh[1] = inode->i_ino;
2726        fh[2] = ((__u64)inode->i_ino) >> 32;
2727
2728        *len = 3;
2729        return 1;
2730}
2731
2732static const struct export_operations shmem_export_ops = {
2733        .get_parent     = shmem_get_parent,
2734        .encode_fh      = shmem_encode_fh,
2735        .fh_to_dentry   = shmem_fh_to_dentry,
2736};
2737
2738static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2739                               bool remount)
2740{
2741        char *this_char, *value, *rest;
2742        struct mempolicy *mpol = NULL;
2743        uid_t uid;
2744        gid_t gid;
2745
2746        while (options != NULL) {
2747                this_char = options;
2748                for (;;) {
2749                        /*
2750                         * NUL-terminate this option: unfortunately,
2751                         * mount options form a comma-separated list,
2752                         * but mpol's nodelist may also contain commas.
2753                         */
2754                        options = strchr(options, ',');
2755                        if (options == NULL)
2756                                break;
2757                        options++;
2758                        if (!isdigit(*options)) {
2759                                options[-1] = '\0';
2760                                break;
2761                        }
2762                }
2763                if (!*this_char)
2764                        continue;
2765                if ((value = strchr(this_char,'=')) != NULL) {
2766                        *value++ = 0;
2767                } else {
2768                        printk(KERN_ERR
2769                            "tmpfs: No value for mount option '%s'\n",
2770                            this_char);
2771                        goto error;
2772                }
2773
2774                if (!strcmp(this_char,"size")) {
2775                        unsigned long long size;
2776                        size = memparse(value,&rest);
2777                        if (*rest == '%') {
2778                                size <<= PAGE_SHIFT;
2779                                size *= totalram_pages;
2780                                do_div(size, 100);
2781                                rest++;
2782                        }
2783                        if (*rest)
2784                                goto bad_val;
2785                        sbinfo->max_blocks =
2786                                DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2787                } else if (!strcmp(this_char,"nr_blocks")) {
2788                        sbinfo->max_blocks = memparse(value, &rest);
2789                        if (*rest)
2790                                goto bad_val;
2791                } else if (!strcmp(this_char,"nr_inodes")) {
2792                        sbinfo->max_inodes = memparse(value, &rest);
2793                        if (*rest)
2794                                goto bad_val;
2795                } else if (!strcmp(this_char,"mode")) {
2796                        if (remount)
2797                                continue;
2798                        sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
2799                        if (*rest)
2800                                goto bad_val;
2801                } else if (!strcmp(this_char,"uid")) {
2802                        if (remount)
2803                                continue;
2804                        uid = simple_strtoul(value, &rest, 0);
2805                        if (*rest)
2806                                goto bad_val;
2807                        sbinfo->uid = make_kuid(current_user_ns(), uid);
2808                        if (!uid_valid(sbinfo->uid))
2809                                goto bad_val;
2810                } else if (!strcmp(this_char,"gid")) {
2811                        if (remount)
2812                                continue;
2813                        gid = simple_strtoul(value, &rest, 0);
2814                        if (*rest)
2815                                goto bad_val;
2816                        sbinfo->gid = make_kgid(current_user_ns(), gid);
2817                        if (!gid_valid(sbinfo->gid))
2818                                goto bad_val;
2819                } else if (!strcmp(this_char,"mpol")) {
2820                        mpol_put(mpol);
2821                        mpol = NULL;
2822                        if (mpol_parse_str(value, &mpol))
2823                                goto bad_val;
2824                } else {
2825                        printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2826                               this_char);
2827                        goto error;
2828                }
2829        }
2830        sbinfo->mpol = mpol;
2831        return 0;
2832
2833bad_val:
2834        printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2835               value, this_char);
2836error:
2837        mpol_put(mpol);
2838        return 1;
2839
2840}
2841
2842static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2843{
2844        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2845        struct shmem_sb_info config = *sbinfo;
2846        unsigned long inodes;
2847        int error = -EINVAL;
2848
2849        config.mpol = NULL;
2850        if (shmem_parse_options(data, &config, true))
2851                return error;
2852
2853        spin_lock(&sbinfo->stat_lock);
2854        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2855        if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2856                goto out;
2857        if (config.max_inodes < inodes)
2858                goto out;
2859        /*
2860         * Those tests disallow limited->unlimited while any are in use;
2861         * but we must separately disallow unlimited->limited, because
2862         * in that case we have no record of how much is already in use.
2863         */
2864        if (config.max_blocks && !sbinfo->max_blocks)
2865                goto out;
2866        if (config.max_inodes && !sbinfo->max_inodes)
2867                goto out;
2868
2869        error = 0;
2870        sbinfo->max_blocks  = config.max_blocks;
2871        sbinfo->max_inodes  = config.max_inodes;
2872        sbinfo->free_inodes = config.max_inodes - inodes;
2873
2874        /*
2875         * Preserve previous mempolicy unless mpol remount option was specified.
2876         */
2877        if (config.mpol) {
2878                mpol_put(sbinfo->mpol);
2879                sbinfo->mpol = config.mpol;     /* transfers initial ref */
2880        }
2881out:
2882        spin_unlock(&sbinfo->stat_lock);
2883        return error;
2884}
2885
2886static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2887{
2888        struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
2889
2890        if (sbinfo->max_blocks != shmem_default_max_blocks())
2891                seq_printf(seq, ",size=%luk",
2892                        sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
2893        if (sbinfo->max_inodes != shmem_default_max_inodes())
2894                seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2895        if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2896                seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2897        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2898                seq_printf(seq, ",uid=%u",
2899                                from_kuid_munged(&init_user_ns, sbinfo->uid));
2900        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2901                seq_printf(seq, ",gid=%u",
2902                                from_kgid_munged(&init_user_ns, sbinfo->gid));
2903        shmem_show_mpol(seq, sbinfo->mpol);
2904        return 0;
2905}
2906
2907#define MFD_NAME_PREFIX "memfd:"
2908#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
2909#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
2910
2911#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
2912
2913SYSCALL_DEFINE2(memfd_create,
2914                const char __user *, uname,
2915                unsigned int, flags)
2916{
2917        struct shmem_inode_info *info;
2918        struct file *file;
2919        int fd, error;
2920        char *name;
2921        long len;
2922
2923        if (flags & ~(unsigned int)MFD_ALL_FLAGS)
2924                return -EINVAL;
2925
2926        /* length includes terminating zero */
2927        len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
2928        if (len <= 0)
2929                return -EFAULT;
2930        if (len > MFD_NAME_MAX_LEN + 1)
2931                return -EINVAL;
2932
2933        name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
2934        if (!name)
2935                return -ENOMEM;
2936
2937        strcpy(name, MFD_NAME_PREFIX);
2938        if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
2939                error = -EFAULT;
2940                goto err_name;
2941        }
2942
2943        /* terminating-zero may have changed after strnlen_user() returned */
2944        if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
2945                error = -EFAULT;
2946                goto err_name;
2947        }
2948
2949        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
2950        if (fd < 0) {
2951                error = fd;
2952                goto err_name;
2953        }
2954
2955        file = shmem_file_setup(name, 0, VM_NORESERVE);
2956        if (IS_ERR(file)) {
2957                error = PTR_ERR(file);
2958                goto err_fd;
2959        }
2960        info = SHMEM_I(file_inode(file));
2961        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
2962        file->f_flags |= O_RDWR | O_LARGEFILE;
2963        if (flags & MFD_ALLOW_SEALING)
2964                info->seals &= ~F_SEAL_SEAL;
2965
2966        fd_install(fd, file);
2967        kfree(name);
2968        return fd;
2969
2970err_fd:
2971        put_unused_fd(fd);
2972err_name:
2973        kfree(name);
2974        return error;
2975}
2976
2977#endif /* CONFIG_TMPFS */
2978
2979static void shmem_put_super(struct super_block *sb)
2980{
2981        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2982
2983        percpu_counter_destroy(&sbinfo->used_blocks);
2984        mpol_put(sbinfo->mpol);
2985        kfree(sbinfo);
2986        sb->s_fs_info = NULL;
2987}
2988
2989int shmem_fill_super(struct super_block *sb, void *data, int silent)
2990{
2991        struct inode *inode;
2992        struct shmem_sb_info *sbinfo;
2993        int err = -ENOMEM;
2994
2995        /* Round up to L1_CACHE_BYTES to resist false sharing */
2996        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2997                                L1_CACHE_BYTES), GFP_KERNEL);
2998        if (!sbinfo)
2999                return -ENOMEM;
3000

3001        sbinfo->mode = S_IRWXUGO | S_ISVTX;
3002        sbinfo->uid = current_fsuid();
3003        sbinfo->gid = current_fsgid();
3004        sb->s_fs_info = sbinfo;
3005
3006#ifdef CONFIG_TMPFS
3007        /*
3008         * Per default we only allow half of the physical ram per
3009         * tmpfs instance, limiting inodes to one per page of lowmem;
3010         * but the internal instance is left unlimited.
3011         */
3012        if (!(sb->s_flags & MS_KERNMOUNT)) {
3013                sbinfo->max_blocks = shmem_default_max_blocks();
3014                sbinfo->max_inodes = shmem_default_max_inodes();
3015                if (shmem_parse_options(data, sbinfo, false)) {
3016                        err = -EINVAL;
3017                        goto failed;
3018                }
3019        } else {
3020                sb->s_flags |= MS_NOUSER;
3021        }
3022        sb->s_export_op = &shmem_export_ops;
3023        sb->s_flags |= MS_NOSEC;
3024#else
3025        sb->s_flags |= MS_NOUSER;
3026#endif
3027
3028        spin_lock_init(&sbinfo->stat_lock);
3029        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3030                goto failed;
3031        sbinfo->free_inodes = sbinfo->max_inodes;
3032
3033        sb->s_maxbytes = MAX_LFS_FILESIZE;
3034        sb->s_blocksize = PAGE_CACHE_SIZE;
3035        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
3036        sb->s_magic = TMPFS_MAGIC;
3037        sb->s_op = &shmem_ops;
3038        sb->s_time_gran = 1;
3039#ifdef CONFIG_TMPFS_XATTR
3040        sb->s_xattr = shmem_xattr_handlers;
3041#endif
3042#ifdef CONFIG_TMPFS_POSIX_ACL
3043        sb->s_flags |= MS_POSIXACL;
3044#endif
3045
3046        inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
3047        if (!inode)
3048                goto failed;
3049        inode->i_uid = sbinfo->uid;
3050        inode->i_gid = sbinfo->gid;
3051        sb->s_root = d_make_root(inode);
3052        if (!sb->s_root)
3053                goto failed;
3054        return 0;
3055
3056failed:
3057        shmem_put_super(sb);
3058        return err;
3059}
3060
3061static struct kmem_cache *shmem_inode_cachep;
3062
3063static struct inode *shmem_alloc_inode(struct super_block *sb)
3064{
3065        struct shmem_inode_info *info;
3066        info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3067        if (!info)
3068                return NULL;
3069        return &info->vfs_inode;
3070}
3071
3072static void shmem_destroy_callback(struct rcu_head *head)
3073{
3074        struct inode *inode = container_of(head, struct inode, i_rcu);
3075        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3076}
3077
3078static void shmem_destroy_inode(struct inode *inode)
3079{
3080        if (S_ISREG(inode->i_mode))
3081                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3082        call_rcu(&inode->i_rcu, shmem_destroy_callback);
3083}
3084
3085static void shmem_init_inode(void *foo)
3086{
3087        struct shmem_inode_info *info = foo;
3088        inode_init_once(&info->vfs_inode);
3089}
3090
3091static int shmem_init_inodecache(void)
3092{
3093        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3094                                sizeof(struct shmem_inode_info),
3095                                0, SLAB_PANIC, shmem_init_inode);
3096        return 0;
3097}
3098
3099static void shmem_destroy_inodecache(void)
3100{
3101        kmem_cache_destroy(shmem_inode_cachep);
3102}
3103
3104static const struct address_space_operations shmem_aops = {
3105        .writepage      = shmem_writepage,
3106        .set_page_dirty = __set_page_dirty_no_writeback,
3107#ifdef CONFIG_TMPFS
3108        .write_begin    = shmem_write_begin,
3109        .write_end      = shmem_write_end,
3110#endif
3111#ifdef CONFIG_MIGRATION
3112        .migratepage    = migrate_page,
3113#endif
3114        .error_remove_page = generic_error_remove_page,
3115};
3116
3117static const struct file_operations shmem_file_operations = {
3118        .mmap           = shmem_mmap,
3119#ifdef CONFIG_TMPFS
3120        .llseek         = shmem_file_llseek,
3121        .read_iter      = shmem_file_read_iter,
3122        .write_iter     = generic_file_write_iter,
3123        .fsync          = noop_fsync,
3124        .splice_read    = shmem_file_splice_read,
3125        .splice_write   = iter_file_splice_write,
3126        .fallocate      = shmem_fallocate,
3127#endif
3128};
3129
3130static const struct inode_operations shmem_inode_operations = {
3131        .setattr        = shmem_setattr,
3132#ifdef CONFIG_TMPFS_XATTR
3133        .setxattr       = shmem_setxattr,
3134        .getxattr       = shmem_getxattr,
3135        .listxattr      = shmem_listxattr,
3136        .removexattr    = shmem_removexattr,
3137        .set_acl        = simple_set_acl,
3138#endif
3139};
3140
3141static const struct inode_operations shmem_dir_inode_operations = {
3142#ifdef CONFIG_TMPFS
3143        .create         = shmem_create,
3144        .lookup         = simple_lookup,
3145        .link           = shmem_link,
3146        .unlink         = shmem_unlink,
3147        .symlink        = shmem_symlink,
3148        .mkdir          = shmem_mkdir,
3149        .rmdir          = shmem_rmdir,
3150        .mknod          = shmem_mknod,
3151        .rename2        = shmem_rename2,
3152        .tmpfile        = shmem_tmpfile,
3153#endif
3154#ifdef CONFIG_TMPFS_XATTR
3155        .setxattr       = shmem_setxattr,
3156        .getxattr       = shmem_getxattr,
3157        .listxattr      = shmem_listxattr,
3158        .removexattr    = shmem_removexattr,
3159#endif
3160#ifdef CONFIG_TMPFS_POSIX_ACL
3161        .setattr        = shmem_setattr,
3162        .set_acl        = simple_set_acl,
3163#endif
3164};
3165
3166static const struct inode_operations shmem_special_inode_operations = {
3167#ifdef CONFIG_TMPFS_XATTR
3168        .setxattr       = shmem_setxattr,
3169        .getxattr       = shmem_getxattr,
3170        .listxattr      = shmem_listxattr,
3171        .removexattr    = shmem_removexattr,
3172#endif
3173#ifdef CONFIG_TMPFS_POSIX_ACL
3174        .setattr        = shmem_setattr,
3175        .set_acl        = simple_set_acl,
3176#endif
3177};
3178
3179static const struct super_operations shmem_ops = {
3180        .alloc_inode    = shmem_alloc_inode,
3181        .destroy_inode  = shmem_destroy_inode,
3182#ifdef CONFIG_TMPFS
3183        .statfs         = shmem_statfs,
3184        .remount_fs     = shmem_remount_fs,
3185        .show_options   = shmem_show_options,
3186#endif
3187        .evict_inode    = shmem_evict_inode,
3188        .drop_inode     = generic_delete_inode,
3189        .put_super      = shmem_put_super,
3190};
3191
3192static const struct vm_operations_struct shmem_vm_ops = {
3193        .fault          = shmem_fault,
3194        .map_pages      = filemap_map_pages,
3195#ifdef CONFIG_NUMA
3196        .set_policy     = shmem_set_policy,
3197        .get_policy     = shmem_get_policy,
3198#endif
3199};
3200
3201static struct dentry *shmem_mount(struct file_system_type *fs_type,
3202        int flags, const char *dev_name, void *data)
3203{
3204        return mount_nodev(fs_type, flags, data, shmem_fill_super);
3205}
3206
3207static struct file_system_type shmem_fs_type = {
3208        .owner          = THIS_MODULE,
3209        .name           = "tmpfs",
3210        .mount          = shmem_mount,
3211        .kill_sb        = kill_litter_super,
3212        .fs_flags       = FS_USERNS_MOUNT,
3213};
3214
3215int __init shmem_init(void)
3216{
3217        int error;
3218
3219        /* If rootfs called this, don't re-init */
3220        if (shmem_inode_cachep)
3221                return 0;
3222
3223        error = shmem_init_inodecache();
3224        if (error)
3225                goto out3;
3226
3227        error = register_filesystem(&shmem_fs_type);
3228        if (error) {
3229                printk(KERN_ERR "Could not register tmpfs\n");
3230                goto out2;
3231        }
3232
3233        shm_mnt = kern_mount(&shmem_fs_type);
3234        if (IS_ERR(shm_mnt)) {
3235                error = PTR_ERR(shm_mnt);
3236                printk(KERN_ERR "Could not kern_mount tmpfs\n");
3237                goto out1;
3238        }
3239        return 0;
3240
3241out1:
3242        unregister_filesystem(&shmem_fs_type);
3243out2:
3244        shmem_destroy_inodecache();
3245out3:
3246        shm_mnt = ERR_PTR(error);
3247        return error;
3248}
3249
3250#else /* !CONFIG_SHMEM */
3251
3252/*
3253 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
3254 *
3255 * This is intended for small system where the benefits of the full
3256 * shmem code (swap-backed and resource-limited) are outweighed by
3257 * their complexity. On systems without swap this code should be
3258 * effectively equivalent, but much lighter weight.
3259 */
3260
3261static struct file_system_type shmem_fs_type = {
3262        .name           = "tmpfs",
3263        .mount          = ramfs_mount,
3264        .kill_sb        = kill_litter_super,
3265        .fs_flags       = FS_USERNS_MOUNT,
3266};
3267
3268int __init shmem_init(void)
3269{
3270        BUG_ON(register_filesystem(&shmem_fs_type) != 0);
3271
3272        shm_mnt = kern_mount(&shmem_fs_type);
3273        BUG_ON(IS_ERR(shm_mnt));
3274
3275        return 0;
3276}
3277
3278int shmem_unuse(swp_entry_t swap, struct page *page)
3279{
3280        return 0;
3281}
3282
3283int shmem_lock(struct file *file, int lock, struct user_struct *user)
3284{
3285        return 0;
3286}
3287
3288void shmem_unlock_mapping(struct address_space *mapping)
3289{
3290}
3291
3292void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
3293{
3294        truncate_inode_pages_range(inode->i_mapping, lstart, lend);
3295}
3296EXPORT_SYMBOL_GPL(shmem_truncate_range);
3297
3298#define shmem_vm_ops                            generic_file_vm_ops
3299#define shmem_file_operations                   ramfs_file_operations
3300#define shmem_get_inode(sb, dir, mode, dev, flags)      ramfs_get_inode(sb, dir, mode, dev)
3301#define shmem_acct_size(flags, size)            0
3302#define shmem_unacct_size(flags, size)          do {} while (0)
3303
3304#endif /* CONFIG_SHMEM */
3305
3306/* common code */
3307
3308static struct dentry_operations anon_ops = {
3309        .d_dname = simple_dname
3310};
3311
3312static struct file *__shmem_file_setup(const char *name, loff_t size,
3313                                       unsigned long flags, unsigned int i_flags)
3314{
3315        struct file *res;
3316        struct inode *inode;
3317        struct path path;
3318        struct super_block *sb;
3319        struct qstr this;
3320
3321        if (IS_ERR(shm_mnt))
3322                return ERR_CAST(shm_mnt);
3323
3324        if (size < 0 || size > MAX_LFS_FILESIZE)
3325                return ERR_PTR(-EINVAL);
3326
3327        if (shmem_acct_size(flags, size))
3328                return ERR_PTR(-ENOMEM);
3329
3330        res = ERR_PTR(-ENOMEM);
3331        this.name = name;
3332        this.len = strlen(name);
3333        this.hash = 0; /* will go */
3334        sb = shm_mnt->mnt_sb;
3335        path.mnt = mntget(shm_mnt);
3336        path.dentry = d_alloc_pseudo(sb, &this);
3337        if (!path.dentry)
3338                goto put_memory;
3339        d_set_d_op(path.dentry, &anon_ops);
3340
3341        res = ERR_PTR(-ENOSPC);
3342        inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
3343        if (!inode)
3344                goto put_memory;
3345
3346        inode->i_flags |= i_flags;
3347        d_instantiate(path.dentry, inode);
3348        inode->i_size = size;
3349        clear_nlink(inode);     /* It is unlinked */
3350        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
3351        if (IS_ERR(res))
3352                goto put_path;
3353
3354        res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
3355                  &shmem_file_operations);
3356        if (IS_ERR(res))
3357                goto put_path;
3358
3359        return res;
3360
3361put_memory:
3362        shmem_unacct_size(flags, size);
3363put_path:
3364        path_put(&path);
3365        return res;
3366}
3367
3368/**
3369 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
3370 *      kernel internal.  There will be NO LSM permission checks against the
3371 *      underlying inode.  So users of this interface must do LSM checks at a
3372 *      higher layer.  The one user is the big_key implementation.  LSM checks
3373 *      are provided at the key level rather than the inode level.
3374 * @name: name for dentry (to be seen in /proc/<pid>/maps
3375 * @size: size to be set for the file
3376 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
3377 */
3378struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
3379{
3380        return __shmem_file_setup(name, size, flags, S_PRIVATE);
3381}
3382
3383/**
3384 * shmem_file_setup - get an unlinked file living in tmpfs
3385 * @name: name for dentry (to be seen in /proc/<pid>/maps
3386 * @size: size to be set for the file
3387 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
3388 */
3389struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
3390{
3391        return __shmem_file_setup(name, size, flags, 0);
3392}
3393EXPORT_SYMBOL_GPL(shmem_file_setup);
3394
3395/**
3396 * shmem_zero_setup - setup a shared anonymous mapping
3397 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
3398 */
3399int shmem_zero_setup(struct vm_area_struct *vma)
3400{
3401        struct file *file;
3402        loff_t size = vma->vm_end - vma->vm_start;
3403
3404        /*
3405         * Cloning a new file under mmap_sem leads to a lock ordering conflict
3406         * between XFS directory reading and selinux: since this file is only
3407         * accessible to the user through its mapping, use S_PRIVATE flag to
3408         * bypass file security, in the same way as shmem_kernel_file_setup().
3409         */
3410        file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
3411        if (IS_ERR(file))
3412                return PTR_ERR(file);
3413
3414        if (vma->vm_file)
3415                fput(vma->vm_file);
3416        vma->vm_file = file;
3417        vma->vm_ops = &shmem_vm_ops;
3418        return 0;
3419}
3420
3421/**
3422 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3423 * @mapping:    the page's address_space
3424 * @index:      the page index
3425 * @gfp:        the page allocator flags to use if allocating
3426 *
3427 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3428 * with any new page allocations done using the specified allocation flags.
3429 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3430 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3431 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3432 *
3433 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3434 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3435 */
3436struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3437                                         pgoff_t index, gfp_t gfp)
3438{
3439#ifdef CONFIG_SHMEM
3440        struct inode *inode = mapping->host;
3441        struct page *page;
3442        int error;
3443
3444        BUG_ON(mapping->a_ops != &shmem_aops);
3445        error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3446        if (error)
3447                page = ERR_PTR(error);
3448        else
3449                unlock_page(page);
3450        return page;
3451#else
3452        /*
3453         * The tiny !SHMEM case uses ramfs without swap
3454         */
3455        return read_cache_page_gfp(mapping, index, gfp);
3456#endif
3457}
3458EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
3459