linux/mm/shmem.c
   1/*
   2 * Resizable virtual memory filesystem for Linux.
   3 *
   4 * Copyright (C) 2000 Linus Torvalds.
   5 *               2000 Transmeta Corp.
   6 *               2000-2001 Christoph Rohland
   7 *               2000-2001 SAP AG
   8 *               2002 Red Hat Inc.
   9 * Copyright (C) 2002-2011 Hugh Dickins.
  10 * Copyright (C) 2011 Google Inc.
  11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13 *
  14 * Extended attribute support for tmpfs:
  15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17 *
  18 * tiny-shmem:
  19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20 *
  21 * This file is released under the GPL.
  22 */
  23
  24#include <linux/fs.h>
  25#include <linux/init.h>
  26#include <linux/vfs.h>
  27#include <linux/mount.h>
  28#include <linux/ramfs.h>
  29#include <linux/pagemap.h>
  30#include <linux/file.h>
  31#include <linux/mm.h>
  32#include <linux/random.h>
  33#include <linux/sched/signal.h>
  34#include <linux/export.h>
  35#include <linux/swap.h>
  36#include <linux/uio.h>
  37#include <linux/khugepaged.h>
  38#include <linux/hugetlb.h>
  39#include <linux/frontswap.h>
  40#include <linux/fs_parser.h>
  41
  42#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
  43
  44static struct vfsmount *shm_mnt;
  45
  46#ifdef CONFIG_SHMEM
  47/*
  48 * This virtual memory filesystem is heavily based on the ramfs. It
   49 * extends ramfs with the ability to use swap and honor resource limits,
   50 * which makes it a completely usable filesystem.
  51 */
  52
  53#include <linux/xattr.h>
  54#include <linux/exportfs.h>
  55#include <linux/posix_acl.h>
  56#include <linux/posix_acl_xattr.h>
  57#include <linux/mman.h>
  58#include <linux/string.h>
  59#include <linux/slab.h>
  60#include <linux/backing-dev.h>
  61#include <linux/shmem_fs.h>
  62#include <linux/writeback.h>
  63#include <linux/blkdev.h>
  64#include <linux/pagevec.h>
  65#include <linux/percpu_counter.h>
  66#include <linux/falloc.h>
  67#include <linux/splice.h>
  68#include <linux/security.h>
  69#include <linux/swapops.h>
  70#include <linux/mempolicy.h>
  71#include <linux/namei.h>
  72#include <linux/ctype.h>
  73#include <linux/migrate.h>
  74#include <linux/highmem.h>
  75#include <linux/seq_file.h>
  76#include <linux/magic.h>
  77#include <linux/syscalls.h>
  78#include <linux/fcntl.h>
  79#include <uapi/linux/memfd.h>
  80#include <linux/userfaultfd_k.h>
  81#include <linux/rmap.h>
  82#include <linux/uuid.h>
  83
  84#include <linux/uaccess.h>
  85#include <asm/pgtable.h>
  86
  87#include "internal.h"
  88
  89#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
  90#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
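/*
 * Worked example (illustrative, assuming 4kB pages): PAGE_SIZE == 4096 makes
 * BLOCKS_PER_PAGE == 8, so inode->i_blocks is kept in 512-byte units; and
 * VM_ACCT(5000) == PAGE_ALIGN(5000) >> PAGE_SHIFT == 8192 >> 12 == 2, i.e.
 * the memory accounting below always charges whole pages, rounded up.
 */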
  91
  92/* Pretend that each entry is of this size in directory's i_size */
  93#define BOGO_DIRENT_SIZE 20
  94
  95/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
  96#define SHORT_SYMLINK_LEN 128
  97
  98/*
  99 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 100 * inode->i_private (with i_mutex making sure that it has only one user at
 101 * a time): we would prefer not to enlarge the shmem inode just for that.
 102 */
 103struct shmem_falloc {
 104        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
 105        pgoff_t start;          /* start of range currently being fallocated */
 106        pgoff_t next;           /* the next page offset to be fallocated */
 107        pgoff_t nr_falloced;    /* how many new pages have been fallocated */
 108        pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
 109};
 110
 111struct shmem_options {
 112        unsigned long long blocks;
 113        unsigned long long inodes;
 114        struct mempolicy *mpol;
 115        kuid_t uid;
 116        kgid_t gid;
 117        umode_t mode;
 118        int huge;
 119        int seen;
 120#define SHMEM_SEEN_BLOCKS 1
 121#define SHMEM_SEEN_INODES 2
 122#define SHMEM_SEEN_HUGE 4
 123};
 124
 125#ifdef CONFIG_TMPFS
 126static unsigned long shmem_default_max_blocks(void)
 127{
 128        return totalram_pages() / 2;
 129}
 130
 131static unsigned long shmem_default_max_inodes(void)
 132{
 133        unsigned long nr_pages = totalram_pages();
 134
 135        return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 136}
 137#endif
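/*
 * Example (illustrative): on a machine with 8GB of 4kB pages and no highmem,
 * the defaults above come to ~4GB worth of blocks (half of RAM) and roughly
 * one million inodes; both can be overridden with the size= and nr_inodes=
 * mount options.
 */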
 138
 139static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 140static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 141                                struct shmem_inode_info *info, pgoff_t index);
 142static int shmem_swapin_page(struct inode *inode, pgoff_t index,
 143                             struct page **pagep, enum sgp_type sgp,
 144                             gfp_t gfp, struct vm_area_struct *vma,
 145                             vm_fault_t *fault_type);
 146static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 147                struct page **pagep, enum sgp_type sgp,
 148                gfp_t gfp, struct vm_area_struct *vma,
 149                struct vm_fault *vmf, vm_fault_t *fault_type);
 150
 151int shmem_getpage(struct inode *inode, pgoff_t index,
 152                struct page **pagep, enum sgp_type sgp)
 153{
 154        return shmem_getpage_gfp(inode, index, pagep, sgp,
 155                mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
 156}
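/*
 * Minimal usage sketch (illustrative, not a caller from this file): to get
 * the page backing offset 'index' of a tmpfs inode, allocating it if absent:
 *
 *	struct page *page;
 *	int err = shmem_getpage(inode, index, &page, SGP_CACHE);
 *	if (!err) {
 *		... use the locked, referenced page ...
 *		unlock_page(page);
 *		put_page(page);
 *	}
 *
 * With SGP_READ instead, holes are not allocated and *pagep may come back
 * NULL, as shmem_undo_range() below relies on.
 */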
 157
 158static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 159{
 160        return sb->s_fs_info;
 161}
 162
 163/*
 164 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 165 * for shared memory and for shared anonymous (/dev/zero) mappings
 166 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 167 * consistent with the pre-accounting of private mappings ...
 168 */
 169static inline int shmem_acct_size(unsigned long flags, loff_t size)
 170{
 171        return (flags & VM_NORESERVE) ?
 172                0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
 173}
 174
 175static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 176{
 177        if (!(flags & VM_NORESERVE))
 178                vm_unacct_memory(VM_ACCT(size));
 179}
 180
 181static inline int shmem_reacct_size(unsigned long flags,
 182                loff_t oldsize, loff_t newsize)
 183{
 184        if (!(flags & VM_NORESERVE)) {
 185                if (VM_ACCT(newsize) > VM_ACCT(oldsize))
 186                        return security_vm_enough_memory_mm(current->mm,
 187                                        VM_ACCT(newsize) - VM_ACCT(oldsize));
 188                else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
 189                        vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
 190        }
 191        return 0;
 192}
 193
 194/*
 195 * ... whereas tmpfs objects are accounted incrementally as
 196 * pages are allocated, in order to allow large sparse files.
 197 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 198 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 199 */
 200static inline int shmem_acct_block(unsigned long flags, long pages)
 201{
 202        if (!(flags & VM_NORESERVE))
 203                return 0;
 204
 205        return security_vm_enough_memory_mm(current->mm,
 206                        pages * VM_ACCT(PAGE_SIZE));
 207}
 208
 209static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 210{
 211        if (flags & VM_NORESERVE)
 212                vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 213}
 214
 215static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
 216{
 217        struct shmem_inode_info *info = SHMEM_I(inode);
 218        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 219
 220        if (shmem_acct_block(info->flags, pages))
 221                return false;
 222
 223        if (sbinfo->max_blocks) {
 224                if (percpu_counter_compare(&sbinfo->used_blocks,
 225                                           sbinfo->max_blocks - pages) > 0)
 226                        goto unacct;
 227                percpu_counter_add(&sbinfo->used_blocks, pages);
 228        }
 229
 230        return true;
 231
 232unacct:
 233        shmem_unacct_blocks(info->flags, pages);
 234        return false;
 235}
 236
 237static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
 238{
 239        struct shmem_inode_info *info = SHMEM_I(inode);
 240        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 241
 242        if (sbinfo->max_blocks)
 243                percpu_counter_sub(&sbinfo->used_blocks, pages);
 244        shmem_unacct_blocks(info->flags, pages);
 245}
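/*
 * Typical pairing (sketch): a caller about to add nr pages to the mapping
 * charges first and backs the charge out if allocation fails, e.g.
 *
 *	if (!shmem_inode_acct_block(inode, nr))
 *		return ERR_PTR(-ENOSPC);
 *	page = ...allocate...;
 *	if (!page) {
 *		shmem_inode_unacct_blocks(inode, nr);
 *		return ERR_PTR(-ENOMEM);
 *	}
 *
 * shmem_alloc_and_acct_page() further down follows exactly this pattern.
 */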
 246
 247static const struct super_operations shmem_ops;
 248static const struct address_space_operations shmem_aops;
 249static const struct file_operations shmem_file_operations;
 250static const struct inode_operations shmem_inode_operations;
 251static const struct inode_operations shmem_dir_inode_operations;
 252static const struct inode_operations shmem_special_inode_operations;
 253static const struct vm_operations_struct shmem_vm_ops;
 254static struct file_system_type shmem_fs_type;
 255
 256bool vma_is_shmem(struct vm_area_struct *vma)
 257{
 258        return vma->vm_ops == &shmem_vm_ops;
 259}
 260
 261static LIST_HEAD(shmem_swaplist);
 262static DEFINE_MUTEX(shmem_swaplist_mutex);
 263
 264static int shmem_reserve_inode(struct super_block *sb)
 265{
 266        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 267        if (sbinfo->max_inodes) {
 268                spin_lock(&sbinfo->stat_lock);
 269                if (!sbinfo->free_inodes) {
 270                        spin_unlock(&sbinfo->stat_lock);
 271                        return -ENOSPC;
 272                }
 273                sbinfo->free_inodes--;
 274                spin_unlock(&sbinfo->stat_lock);
 275        }
 276        return 0;
 277}
 278
 279static void shmem_free_inode(struct super_block *sb)
 280{
 281        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 282        if (sbinfo->max_inodes) {
 283                spin_lock(&sbinfo->stat_lock);
 284                sbinfo->free_inodes++;
 285                spin_unlock(&sbinfo->stat_lock);
 286        }
 287}
 288
 289/**
 290 * shmem_recalc_inode - recalculate the block usage of an inode
 291 * @inode: inode to recalc
 292 *
 293 * We have to calculate the free blocks since the mm can drop
 294 * undirtied hole pages behind our back.
 295 *
 296 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 297 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 298 *
 299 * It has to be called with the spinlock held.
 300 */
 301static void shmem_recalc_inode(struct inode *inode)
 302{
 303        struct shmem_inode_info *info = SHMEM_I(inode);
 304        long freed;
 305
 306        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
 307        if (freed > 0) {
 308                info->alloced -= freed;
 309                inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 310                shmem_inode_unacct_blocks(inode, freed);
 311        }
 312}
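/*
 * Example (illustrative): with info->alloced == 10, info->swapped == 2 and
 * nrpages == 6, two undirtied hole pages must have been dropped by reclaim
 * behind our back, so freed == 2 and those two blocks are given back to the
 * accounting above.
 */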
 313
 314bool shmem_charge(struct inode *inode, long pages)
 315{
 316        struct shmem_inode_info *info = SHMEM_I(inode);
 317        unsigned long flags;
 318
 319        if (!shmem_inode_acct_block(inode, pages))
 320                return false;
 321
 322        /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
 323        inode->i_mapping->nrpages += pages;
 324
 325        spin_lock_irqsave(&info->lock, flags);
 326        info->alloced += pages;
 327        inode->i_blocks += pages * BLOCKS_PER_PAGE;
 328        shmem_recalc_inode(inode);
 329        spin_unlock_irqrestore(&info->lock, flags);
 330
 331        return true;
 332}
 333
 334void shmem_uncharge(struct inode *inode, long pages)
 335{
 336        struct shmem_inode_info *info = SHMEM_I(inode);
 337        unsigned long flags;
 338
 339        /* nrpages adjustment done by __delete_from_page_cache() or caller */
 340
 341        spin_lock_irqsave(&info->lock, flags);
 342        info->alloced -= pages;
 343        inode->i_blocks -= pages * BLOCKS_PER_PAGE;
 344        shmem_recalc_inode(inode);
 345        spin_unlock_irqrestore(&info->lock, flags);
 346
 347        shmem_inode_unacct_blocks(inode, pages);
 348}
 349
 350/*
 351 * Replace item expected in xarray by a new item, while holding xa_lock.
 352 */
 353static int shmem_replace_entry(struct address_space *mapping,
 354                        pgoff_t index, void *expected, void *replacement)
 355{
 356        XA_STATE(xas, &mapping->i_pages, index);
 357        void *item;
 358
 359        VM_BUG_ON(!expected);
 360        VM_BUG_ON(!replacement);
 361        item = xas_load(&xas);
 362        if (item != expected)
 363                return -ENOENT;
 364        xas_store(&xas, replacement);
 365        return 0;
 366}
 367
 368/*
 369 * Sometimes, before we decide whether to proceed or to fail, we must check
 370 * that an entry was not already brought back from swap by a racing thread.
 371 *
 372 * Checking page is not enough: by the time a SwapCache page is locked, it
 373 * might be reused, and again be SwapCache, using the same swap as before.
 374 */
 375static bool shmem_confirm_swap(struct address_space *mapping,
 376                               pgoff_t index, swp_entry_t swap)
 377{
 378        return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
 379}
 380
 381/*
 382 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 383 *
 384 * SHMEM_HUGE_NEVER:
 385 *      disables huge pages for the mount;
 386 * SHMEM_HUGE_ALWAYS:
 387 *      enables huge pages for the mount;
 388 * SHMEM_HUGE_WITHIN_SIZE:
 389 *      only allocate huge pages if the page will be fully within i_size,
 390 *      also respect fadvise()/madvise() hints;
 391 * SHMEM_HUGE_ADVISE:
 392 *      only allocate huge pages if requested with fadvise()/madvise();
 393 */
 394
 395#define SHMEM_HUGE_NEVER        0
 396#define SHMEM_HUGE_ALWAYS       1
 397#define SHMEM_HUGE_WITHIN_SIZE  2
 398#define SHMEM_HUGE_ADVISE       3
 399
 400/*
 401 * Special values.
  402 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 403 *
 404 * SHMEM_HUGE_DENY:
 405 *      disables huge on shm_mnt and all mounts, for emergency use;
 406 * SHMEM_HUGE_FORCE:
  407 *      enables huge on shm_mnt and all mounts, with no mount option needed, for testing;
 408 *
 409 */
 410#define SHMEM_HUGE_DENY         (-1)
 411#define SHMEM_HUGE_FORCE        (-2)
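/*
 * Userspace sketch (illustrative, not part of this file): the per-mount
 * policy is chosen with the "huge=" mount option, e.g.
 *
 *	mount("tmpfs", "/mnt/tst", "tmpfs", 0, "size=1G,huge=within_size");
 *
 * while deny/force can only come from
 * /sys/kernel/mm/transparent_hugepage/shmem_enabled, as noted above.
 */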
 412
 413#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
 414/* ifdef here to avoid bloating shmem.o when not necessary */
 415
 416static int shmem_huge __read_mostly;
 417
 418#if defined(CONFIG_SYSFS)
 419static int shmem_parse_huge(const char *str)
 420{
 421        if (!strcmp(str, "never"))
 422                return SHMEM_HUGE_NEVER;
 423        if (!strcmp(str, "always"))
 424                return SHMEM_HUGE_ALWAYS;
 425        if (!strcmp(str, "within_size"))
 426                return SHMEM_HUGE_WITHIN_SIZE;
 427        if (!strcmp(str, "advise"))
 428                return SHMEM_HUGE_ADVISE;
 429        if (!strcmp(str, "deny"))
 430                return SHMEM_HUGE_DENY;
 431        if (!strcmp(str, "force"))
 432                return SHMEM_HUGE_FORCE;
 433        return -EINVAL;
 434}
 435#endif
 436
 437#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 438static const char *shmem_format_huge(int huge)
 439{
 440        switch (huge) {
 441        case SHMEM_HUGE_NEVER:
 442                return "never";
 443        case SHMEM_HUGE_ALWAYS:
 444                return "always";
 445        case SHMEM_HUGE_WITHIN_SIZE:
 446                return "within_size";
 447        case SHMEM_HUGE_ADVISE:
 448                return "advise";
 449        case SHMEM_HUGE_DENY:
 450                return "deny";
 451        case SHMEM_HUGE_FORCE:
 452                return "force";
 453        default:
 454                VM_BUG_ON(1);
 455                return "bad_val";
 456        }
 457}
 458#endif
 459
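/*
 * Walk sbinfo->shrinklist and try to split the huge page at EOF of each
 * listed inode when part of it lies beyond i_size, so the excess can be
 * reclaimed.  Inodes with nothing to gain are dropped from the list; when
 * nr_to_split is non-zero, at most that many pages are split.  Returns the
 * number of pages split, or SHRINK_STOP if the list is empty.
 */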
 460static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 461                struct shrink_control *sc, unsigned long nr_to_split)
 462{
 463        LIST_HEAD(list), *pos, *next;
 464        LIST_HEAD(to_remove);
 465        struct inode *inode;
 466        struct shmem_inode_info *info;
 467        struct page *page;
 468        unsigned long batch = sc ? sc->nr_to_scan : 128;
 469        int removed = 0, split = 0;
 470
 471        if (list_empty(&sbinfo->shrinklist))
 472                return SHRINK_STOP;
 473
 474        spin_lock(&sbinfo->shrinklist_lock);
 475        list_for_each_safe(pos, next, &sbinfo->shrinklist) {
 476                info = list_entry(pos, struct shmem_inode_info, shrinklist);
 477
 478                /* pin the inode */
 479                inode = igrab(&info->vfs_inode);
 480
 481                /* inode is about to be evicted */
 482                if (!inode) {
 483                        list_del_init(&info->shrinklist);
 484                        removed++;
 485                        goto next;
 486                }
 487
 488                /* Check if there's anything to gain */
 489                if (round_up(inode->i_size, PAGE_SIZE) ==
 490                                round_up(inode->i_size, HPAGE_PMD_SIZE)) {
 491                        list_move(&info->shrinklist, &to_remove);
 492                        removed++;
 493                        goto next;
 494                }
 495
 496                list_move(&info->shrinklist, &list);
 497next:
 498                if (!--batch)
 499                        break;
 500        }
 501        spin_unlock(&sbinfo->shrinklist_lock);
 502
 503        list_for_each_safe(pos, next, &to_remove) {
 504                info = list_entry(pos, struct shmem_inode_info, shrinklist);
 505                inode = &info->vfs_inode;
 506                list_del_init(&info->shrinklist);
 507                iput(inode);
 508        }
 509
 510        list_for_each_safe(pos, next, &list) {
 511                int ret;
 512
 513                info = list_entry(pos, struct shmem_inode_info, shrinklist);
 514                inode = &info->vfs_inode;
 515
 516                if (nr_to_split && split >= nr_to_split)
 517                        goto leave;
 518
 519                page = find_get_page(inode->i_mapping,
 520                                (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 521                if (!page)
 522                        goto drop;
 523
 524                /* No huge page at the end of the file: nothing to split */
 525                if (!PageTransHuge(page)) {
 526                        put_page(page);
 527                        goto drop;
 528                }
 529
 530                /*
 531                 * Leave the inode on the list if we failed to lock
 532                 * the page at this time.
 533                 *
 534                 * Waiting for the lock may lead to deadlock in the
 535                 * reclaim path.
 536                 */
 537                if (!trylock_page(page)) {
 538                        put_page(page);
 539                        goto leave;
 540                }
 541
 542                ret = split_huge_page(page);
 543                unlock_page(page);
 544                put_page(page);
 545
 546                /* If split failed leave the inode on the list */
 547                if (ret)
 548                        goto leave;
 549
 550                split++;
 551drop:
 552                list_del_init(&info->shrinklist);
 553                removed++;
 554leave:
 555                iput(inode);
 556        }
 557
 558        spin_lock(&sbinfo->shrinklist_lock);
 559        list_splice_tail(&list, &sbinfo->shrinklist);
 560        sbinfo->shrinklist_len -= removed;
 561        spin_unlock(&sbinfo->shrinklist_lock);
 562
 563        return split;
 564}
 565
 566static long shmem_unused_huge_scan(struct super_block *sb,
 567                struct shrink_control *sc)
 568{
 569        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 570
 571        if (!READ_ONCE(sbinfo->shrinklist_len))
 572                return SHRINK_STOP;
 573
 574        return shmem_unused_huge_shrink(sbinfo, sc, 0);
 575}
 576
 577static long shmem_unused_huge_count(struct super_block *sb,
 578                struct shrink_control *sc)
 579{
 580        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 581        return READ_ONCE(sbinfo->shrinklist_len);
 582}
 583#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
 584
 585#define shmem_huge SHMEM_HUGE_DENY
 586
 587static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 588                struct shrink_control *sc, unsigned long nr_to_split)
 589{
 590        return 0;
 591}
 592#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
 593
 594static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
 595{
 596        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
 597            (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
 598            shmem_huge != SHMEM_HUGE_DENY)
 599                return true;
 600        return false;
 601}
 602
 603/*
 604 * Like add_to_page_cache_locked, but error if expected item has gone.
 605 */
 606static int shmem_add_to_page_cache(struct page *page,
 607                                   struct address_space *mapping,
 608                                   pgoff_t index, void *expected, gfp_t gfp)
 609{
 610        XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
 611        unsigned long i = 0;
 612        unsigned long nr = compound_nr(page);
 613
 614        VM_BUG_ON_PAGE(PageTail(page), page);
 615        VM_BUG_ON_PAGE(index != round_down(index, nr), page);
 616        VM_BUG_ON_PAGE(!PageLocked(page), page);
 617        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 618        VM_BUG_ON(expected && PageTransHuge(page));
 619
 620        page_ref_add(page, nr);
 621        page->mapping = mapping;
 622        page->index = index;
 623
 624        do {
 625                void *entry;
 626                xas_lock_irq(&xas);
 627                entry = xas_find_conflict(&xas);
 628                if (entry != expected)
 629                        xas_set_err(&xas, -EEXIST);
 630                xas_create_range(&xas);
 631                if (xas_error(&xas))
 632                        goto unlock;
 633next:
 634                xas_store(&xas, page);
 635                if (++i < nr) {
 636                        xas_next(&xas);
 637                        goto next;
 638                }
 639                if (PageTransHuge(page)) {
 640                        count_vm_event(THP_FILE_ALLOC);
 641                        __inc_node_page_state(page, NR_SHMEM_THPS);
 642                }
 643                mapping->nrpages += nr;
 644                __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
 645                __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
 646unlock:
 647                xas_unlock_irq(&xas);
 648        } while (xas_nomem(&xas, gfp));
 649
 650        if (xas_error(&xas)) {
 651                page->mapping = NULL;
 652                page_ref_sub(page, nr);
 653                return xas_error(&xas);
 654        }
 655
 656        return 0;
 657}
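/*
 * Note on the loop above (illustrative): do { ... } while (xas_nomem(...))
 * is the usual XArray idiom: if the store failed for lack of memory,
 * xas_nomem() performs the allocation with 'gfp' after the lock has been
 * dropped, and asks for another pass through the loop.
 */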
 658
 659/*
 660 * Like delete_from_page_cache, but substitutes swap for page.
 661 */
 662static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 663{
 664        struct address_space *mapping = page->mapping;
 665        int error;
 666
 667        VM_BUG_ON_PAGE(PageCompound(page), page);
 668
 669        xa_lock_irq(&mapping->i_pages);
 670        error = shmem_replace_entry(mapping, page->index, page, radswap);
 671        page->mapping = NULL;
 672        mapping->nrpages--;
 673        __dec_node_page_state(page, NR_FILE_PAGES);
 674        __dec_node_page_state(page, NR_SHMEM);
 675        xa_unlock_irq(&mapping->i_pages);
 676        put_page(page);
 677        BUG_ON(error);
 678}
 679
 680/*
 681 * Remove swap entry from page cache, free the swap and its page cache.
 682 */
 683static int shmem_free_swap(struct address_space *mapping,
 684                           pgoff_t index, void *radswap)
 685{
 686        void *old;
 687
 688        old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
 689        if (old != radswap)
 690                return -ENOENT;
 691        free_swap_and_cache(radix_to_swp_entry(radswap));
 692        return 0;
 693}
 694
 695/*
 696 * Determine (in bytes) how many of the shmem object's pages mapped by the
 697 * given offsets are swapped out.
 698 *
 699 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 700 * as long as the inode doesn't go away and racy results are not a problem.
 701 */
 702unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 703                                                pgoff_t start, pgoff_t end)
 704{
 705        XA_STATE(xas, &mapping->i_pages, start);
 706        struct page *page;
 707        unsigned long swapped = 0;
 708
 709        rcu_read_lock();
 710        xas_for_each(&xas, page, end - 1) {
 711                if (xas_retry(&xas, page))
 712                        continue;
 713                if (xa_is_value(page))
 714                        swapped++;
 715
 716                if (need_resched()) {
 717                        xas_pause(&xas);
 718                        cond_resched_rcu();
 719                }
 720        }
 721
 722        rcu_read_unlock();
 723
 724        return swapped << PAGE_SHIFT;
 725}
 726
 727/*
 728 * Determine (in bytes) how many of the shmem object's pages mapped by the
  729 * given vma are swapped out.
 730 *
 731 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 732 * as long as the inode doesn't go away and racy results are not a problem.
 733 */
 734unsigned long shmem_swap_usage(struct vm_area_struct *vma)
 735{
 736        struct inode *inode = file_inode(vma->vm_file);
 737        struct shmem_inode_info *info = SHMEM_I(inode);
 738        struct address_space *mapping = inode->i_mapping;
 739        unsigned long swapped;
 740
 741        /* Be careful as we don't hold info->lock */
 742        swapped = READ_ONCE(info->swapped);
 743
 744        /*
 745         * The easier cases are when the shmem object has nothing in swap, or
 746         * the vma maps it whole. Then we can simply use the stats that we
 747         * already track.
 748         */
 749        if (!swapped)
 750                return 0;
 751
 752        if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
 753                return swapped << PAGE_SHIFT;
 754
 755        /* Here comes the more involved part */
 756        return shmem_partial_swap_usage(mapping,
 757                        linear_page_index(vma, vma->vm_start),
 758                        linear_page_index(vma, vma->vm_end));
 759}
 760
 761/*
  762 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 763 */
 764void shmem_unlock_mapping(struct address_space *mapping)
 765{
 766        struct pagevec pvec;
 767        pgoff_t indices[PAGEVEC_SIZE];
 768        pgoff_t index = 0;
 769
 770        pagevec_init(&pvec);
 771        /*
 772         * Minor point, but we might as well stop if someone else SHM_LOCKs it.
 773         */
 774        while (!mapping_unevictable(mapping)) {
 775                /*
 776                 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
 777                 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
 778                 */
 779                pvec.nr = find_get_entries(mapping, index,
 780                                           PAGEVEC_SIZE, pvec.pages, indices);
 781                if (!pvec.nr)
 782                        break;
 783                index = indices[pvec.nr - 1] + 1;
 784                pagevec_remove_exceptionals(&pvec);
 785                check_move_unevictable_pages(&pvec);
 786                pagevec_release(&pvec);
 787                cond_resched();
 788        }
 789}
 790
 791/*
 792 * Remove range of pages and swap entries from page cache, and free them.
 793 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 794 */
 795static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 796                                                                 bool unfalloc)
 797{
 798        struct address_space *mapping = inode->i_mapping;
 799        struct shmem_inode_info *info = SHMEM_I(inode);
 800        pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
 801        pgoff_t end = (lend + 1) >> PAGE_SHIFT;
 802        unsigned int partial_start = lstart & (PAGE_SIZE - 1);
 803        unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
 804        struct pagevec pvec;
 805        pgoff_t indices[PAGEVEC_SIZE];
 806        long nr_swaps_freed = 0;
 807        pgoff_t index;
 808        int i;
 809
 810        if (lend == -1)
 811                end = -1;       /* unsigned, so actually very big */
 812
 813        pagevec_init(&pvec);
 814        index = start;
 815        while (index < end) {
 816                pvec.nr = find_get_entries(mapping, index,
 817                        min(end - index, (pgoff_t)PAGEVEC_SIZE),
 818                        pvec.pages, indices);
 819                if (!pvec.nr)
 820                        break;
 821                for (i = 0; i < pagevec_count(&pvec); i++) {
 822                        struct page *page = pvec.pages[i];
 823
 824                        index = indices[i];
 825                        if (index >= end)
 826                                break;
 827
 828                        if (xa_is_value(page)) {
 829                                if (unfalloc)
 830                                        continue;
 831                                nr_swaps_freed += !shmem_free_swap(mapping,
 832                                                                index, page);
 833                                continue;
 834                        }
 835
 836                        VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
 837
 838                        if (!trylock_page(page))
 839                                continue;
 840
 841                        if (PageTransTail(page)) {
 842                                /* Middle of THP: zero out the page */
 843                                clear_highpage(page);
 844                                unlock_page(page);
 845                                continue;
 846                        } else if (PageTransHuge(page)) {
 847                                if (index == round_down(end, HPAGE_PMD_NR)) {
 848                                        /*
 849                                         * Range ends in the middle of THP:
 850                                         * zero out the page
 851                                         */
 852                                        clear_highpage(page);
 853                                        unlock_page(page);
 854                                        continue;
 855                                }
 856                                index += HPAGE_PMD_NR - 1;
 857                                i += HPAGE_PMD_NR - 1;
 858                        }
 859
 860                        if (!unfalloc || !PageUptodate(page)) {
 861                                VM_BUG_ON_PAGE(PageTail(page), page);
 862                                if (page_mapping(page) == mapping) {
 863                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
 864                                        truncate_inode_page(mapping, page);
 865                                }
 866                        }
 867                        unlock_page(page);
 868                }
 869                pagevec_remove_exceptionals(&pvec);
 870                pagevec_release(&pvec);
 871                cond_resched();
 872                index++;
 873        }
 874
 875        if (partial_start) {
 876                struct page *page = NULL;
 877                shmem_getpage(inode, start - 1, &page, SGP_READ);
 878                if (page) {
 879                        unsigned int top = PAGE_SIZE;
 880                        if (start > end) {
 881                                top = partial_end;
 882                                partial_end = 0;
 883                        }
 884                        zero_user_segment(page, partial_start, top);
 885                        set_page_dirty(page);
 886                        unlock_page(page);
 887                        put_page(page);
 888                }
 889        }
 890        if (partial_end) {
 891                struct page *page = NULL;
 892                shmem_getpage(inode, end, &page, SGP_READ);
 893                if (page) {
 894                        zero_user_segment(page, 0, partial_end);
 895                        set_page_dirty(page);
 896                        unlock_page(page);
 897                        put_page(page);
 898                }
 899        }
 900        if (start >= end)
 901                return;
 902
 903        index = start;
 904        while (index < end) {
 905                cond_resched();
 906
 907                pvec.nr = find_get_entries(mapping, index,
 908                                min(end - index, (pgoff_t)PAGEVEC_SIZE),
 909                                pvec.pages, indices);
 910                if (!pvec.nr) {
 911                        /* If all gone or hole-punch or unfalloc, we're done */
 912                        if (index == start || end != -1)
 913                                break;
 914                        /* But if truncating, restart to make sure all gone */
 915                        index = start;
 916                        continue;
 917                }
 918                for (i = 0; i < pagevec_count(&pvec); i++) {
 919                        struct page *page = pvec.pages[i];
 920
 921                        index = indices[i];
 922                        if (index >= end)
 923                                break;
 924
 925                        if (xa_is_value(page)) {
 926                                if (unfalloc)
 927                                        continue;
 928                                if (shmem_free_swap(mapping, index, page)) {
 929                                        /* Swap was replaced by page: retry */
 930                                        index--;
 931                                        break;
 932                                }
 933                                nr_swaps_freed++;
 934                                continue;
 935                        }
 936
 937                        lock_page(page);
 938
 939                        if (PageTransTail(page)) {
 940                                /* Middle of THP: zero out the page */
 941                                clear_highpage(page);
 942                                unlock_page(page);
 943                                /*
  944                                 * Partial THP truncate due to 'start' falling in
  945                                 * the middle of the THP: no need to look at these
  946                                 * pages again on a !pvec.nr restart.
 947                                 */
 948                                if (index != round_down(end, HPAGE_PMD_NR))
 949                                        start++;
 950                                continue;
 951                        } else if (PageTransHuge(page)) {
 952                                if (index == round_down(end, HPAGE_PMD_NR)) {
 953                                        /*
 954                                         * Range ends in the middle of THP:
 955                                         * zero out the page
 956                                         */
 957                                        clear_highpage(page);
 958                                        unlock_page(page);
 959                                        continue;
 960                                }
 961                                index += HPAGE_PMD_NR - 1;
 962                                i += HPAGE_PMD_NR - 1;
 963                        }
 964
 965                        if (!unfalloc || !PageUptodate(page)) {
 966                                VM_BUG_ON_PAGE(PageTail(page), page);
 967                                if (page_mapping(page) == mapping) {
 968                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
 969                                        truncate_inode_page(mapping, page);
 970                                } else {
 971                                        /* Page was replaced by swap: retry */
 972                                        unlock_page(page);
 973                                        index--;
 974                                        break;
 975                                }
 976                        }
 977                        unlock_page(page);
 978                }
 979                pagevec_remove_exceptionals(&pvec);
 980                pagevec_release(&pvec);
 981                index++;
 982        }
 983
 984        spin_lock_irq(&info->lock);
 985        info->swapped -= nr_swaps_freed;
 986        shmem_recalc_inode(inode);
 987        spin_unlock_irq(&info->lock);
 988}
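/*
 * Worked example for shmem_undo_range() (illustrative, 4kB pages): punching
 * lstart=1000, lend=9000 gives start=1, end=2, partial_start=1000 and
 * partial_end=809; so page 1 is removed outright, bytes 1000..4095 of page 0
 * and bytes 0..808 of page 2 are zeroed, which covers the three cases above.
 */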
 989
 990void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 991{
 992        shmem_undo_range(inode, lstart, lend, false);
 993        inode->i_ctime = inode->i_mtime = current_time(inode);
 994}
 995EXPORT_SYMBOL_GPL(shmem_truncate_range);
 996
 997static int shmem_getattr(const struct path *path, struct kstat *stat,
 998                         u32 request_mask, unsigned int query_flags)
 999{
1000        struct inode *inode = path->dentry->d_inode;
1001        struct shmem_inode_info *info = SHMEM_I(inode);
1002        struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
1003
1004        if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
1005                spin_lock_irq(&info->lock);
1006                shmem_recalc_inode(inode);
1007                spin_unlock_irq(&info->lock);
1008        }
1009        generic_fillattr(inode, stat);
1010
1011        if (is_huge_enabled(sb_info))
1012                stat->blksize = HPAGE_PMD_SIZE;
1013
1014        return 0;
1015}
1016
1017static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
1018{
1019        struct inode *inode = d_inode(dentry);
1020        struct shmem_inode_info *info = SHMEM_I(inode);
1021        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1022        int error;
1023
1024        error = setattr_prepare(dentry, attr);
1025        if (error)
1026                return error;
1027
1028        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1029                loff_t oldsize = inode->i_size;
1030                loff_t newsize = attr->ia_size;
1031
1032                /* protected by i_mutex */
1033                if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1034                    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1035                        return -EPERM;
1036
1037                if (newsize != oldsize) {
1038                        error = shmem_reacct_size(SHMEM_I(inode)->flags,
1039                                        oldsize, newsize);
1040                        if (error)
1041                                return error;
1042                        i_size_write(inode, newsize);
1043                        inode->i_ctime = inode->i_mtime = current_time(inode);
1044                }
1045                if (newsize <= oldsize) {
1046                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
1047                        if (oldsize > holebegin)
1048                                unmap_mapping_range(inode->i_mapping,
1049                                                        holebegin, 0, 1);
1050                        if (info->alloced)
1051                                shmem_truncate_range(inode,
1052                                                        newsize, (loff_t)-1);
1053                        /* unmap again to remove racily COWed private pages */
1054                        if (oldsize > holebegin)
1055                                unmap_mapping_range(inode->i_mapping,
1056                                                        holebegin, 0, 1);
1057
1058                        /*
1059                         * Part of the huge page can be beyond i_size: subject
1060                         * to shrink under memory pressure.
1061                         */
1062                        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1063                                spin_lock(&sbinfo->shrinklist_lock);
1064                                /*
 1065                                 * Be careful to defend against unlocked access to
 1066                                 * ->shrinklist in shmem_unused_huge_shrink()
1067                                 */
1068                                if (list_empty_careful(&info->shrinklist)) {
1069                                        list_add_tail(&info->shrinklist,
1070                                                        &sbinfo->shrinklist);
1071                                        sbinfo->shrinklist_len++;
1072                                }
1073                                spin_unlock(&sbinfo->shrinklist_lock);
1074                        }
1075                }
1076        }
1077
1078        setattr_copy(inode, attr);
1079        if (attr->ia_valid & ATTR_MODE)
1080                error = posix_acl_chmod(inode, inode->i_mode);
1081        return error;
1082}
1083
1084static void shmem_evict_inode(struct inode *inode)
1085{
1086        struct shmem_inode_info *info = SHMEM_I(inode);
1087        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1088
1089        if (inode->i_mapping->a_ops == &shmem_aops) {
1090                shmem_unacct_size(info->flags, inode->i_size);
1091                inode->i_size = 0;
1092                shmem_truncate_range(inode, 0, (loff_t)-1);
1093                if (!list_empty(&info->shrinklist)) {
1094                        spin_lock(&sbinfo->shrinklist_lock);
1095                        if (!list_empty(&info->shrinklist)) {
1096                                list_del_init(&info->shrinklist);
1097                                sbinfo->shrinklist_len--;
1098                        }
1099                        spin_unlock(&sbinfo->shrinklist_lock);
1100                }
1101                while (!list_empty(&info->swaplist)) {
1102                        /* Wait while shmem_unuse() is scanning this inode... */
1103                        wait_var_event(&info->stop_eviction,
1104                                       !atomic_read(&info->stop_eviction));
1105                        mutex_lock(&shmem_swaplist_mutex);
1106                        /* ...but beware of the race if we peeked too early */
1107                        if (!atomic_read(&info->stop_eviction))
1108                                list_del_init(&info->swaplist);
1109                        mutex_unlock(&shmem_swaplist_mutex);
1110                }
1111        }
1112
1113        simple_xattrs_free(&info->xattrs);
1114        WARN_ON(inode->i_blocks);
1115        shmem_free_inode(inode->i_sb);
1116        clear_inode(inode);
1117}
1118
1119extern struct swap_info_struct *swap_info[];
1120
1121static int shmem_find_swap_entries(struct address_space *mapping,
1122                                   pgoff_t start, unsigned int nr_entries,
1123                                   struct page **entries, pgoff_t *indices,
1124                                   unsigned int type, bool frontswap)
1125{
1126        XA_STATE(xas, &mapping->i_pages, start);
1127        struct page *page;
1128        swp_entry_t entry;
1129        unsigned int ret = 0;
1130
1131        if (!nr_entries)
1132                return 0;
1133
1134        rcu_read_lock();
1135        xas_for_each(&xas, page, ULONG_MAX) {
1136                if (xas_retry(&xas, page))
1137                        continue;
1138
1139                if (!xa_is_value(page))
1140                        continue;
1141
1142                entry = radix_to_swp_entry(page);
1143                if (swp_type(entry) != type)
1144                        continue;
1145                if (frontswap &&
1146                    !frontswap_test(swap_info[type], swp_offset(entry)))
1147                        continue;
1148
1149                indices[ret] = xas.xa_index;
1150                entries[ret] = page;
1151
1152                if (need_resched()) {
1153                        xas_pause(&xas);
1154                        cond_resched_rcu();
1155                }
1156                if (++ret == nr_entries)
1157                        break;
1158        }
1159        rcu_read_unlock();
1160
1161        return ret;
1162}
1163
1164/*
1165 * Move the swapped pages for an inode to page cache. Returns the count
1166 * of pages swapped in, or the error in case of failure.
1167 */
1168static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1169                                    pgoff_t *indices)
1170{
1171        int i = 0;
1172        int ret = 0;
1173        int error = 0;
1174        struct address_space *mapping = inode->i_mapping;
1175
1176        for (i = 0; i < pvec.nr; i++) {
1177                struct page *page = pvec.pages[i];
1178
1179                if (!xa_is_value(page))
1180                        continue;
1181                error = shmem_swapin_page(inode, indices[i],
1182                                          &page, SGP_CACHE,
1183                                          mapping_gfp_mask(mapping),
1184                                          NULL, NULL);
1185                if (error == 0) {
1186                        unlock_page(page);
1187                        put_page(page);
1188                        ret++;
1189                }
1190                if (error == -ENOMEM)
1191                        break;
1192                error = 0;
1193        }
1194        return error ? error : ret;
1195}
1196
1197/*
1198 * If swap found in inode, free it and move page from swapcache to filecache.
1199 */
1200static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1201                             bool frontswap, unsigned long *fs_pages_to_unuse)
1202{
1203        struct address_space *mapping = inode->i_mapping;
1204        pgoff_t start = 0;
1205        struct pagevec pvec;
1206        pgoff_t indices[PAGEVEC_SIZE];
1207        bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1208        int ret = 0;
1209
1210        pagevec_init(&pvec);
1211        do {
1212                unsigned int nr_entries = PAGEVEC_SIZE;
1213
1214                if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1215                        nr_entries = *fs_pages_to_unuse;
1216
1217                pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1218                                                  pvec.pages, indices,
1219                                                  type, frontswap);
1220                if (pvec.nr == 0) {
1221                        ret = 0;
1222                        break;
1223                }
1224
1225                ret = shmem_unuse_swap_entries(inode, pvec, indices);
1226                if (ret < 0)
1227                        break;
1228
1229                if (frontswap_partial) {
1230                        *fs_pages_to_unuse -= ret;
1231                        if (*fs_pages_to_unuse == 0) {
1232                                ret = FRONTSWAP_PAGES_UNUSED;
1233                                break;
1234                        }
1235                }
1236
1237                start = indices[pvec.nr - 1];
1238        } while (true);
1239
1240        return ret;
1241}
1242
1243/*
1244 * Read all the shared memory data that resides in the swap
1245 * device 'type' back into memory, so the swap device can be
1246 * unused.
1247 */
1248int shmem_unuse(unsigned int type, bool frontswap,
1249                unsigned long *fs_pages_to_unuse)
1250{
1251        struct shmem_inode_info *info, *next;
1252        int error = 0;
1253
1254        if (list_empty(&shmem_swaplist))
1255                return 0;
1256
1257        mutex_lock(&shmem_swaplist_mutex);
1258        list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1259                if (!info->swapped) {
1260                        list_del_init(&info->swaplist);
1261                        continue;
1262                }
1263                /*
1264                 * Drop the swaplist mutex while searching the inode for swap;
1265                 * but before doing so, make sure shmem_evict_inode() will not
1266                 * remove placeholder inode from swaplist, nor let it be freed
1267                 * (igrab() would protect from unlink, but not from unmount).
1268                 */
1269                atomic_inc(&info->stop_eviction);
1270                mutex_unlock(&shmem_swaplist_mutex);
1271
1272                error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1273                                          fs_pages_to_unuse);
1274                cond_resched();
1275
1276                mutex_lock(&shmem_swaplist_mutex);
1277                next = list_next_entry(info, swaplist);
1278                if (!info->swapped)
1279                        list_del_init(&info->swaplist);
1280                if (atomic_dec_and_test(&info->stop_eviction))
1281                        wake_up_var(&info->stop_eviction);
1282                if (error)
1283                        break;
1284        }
1285        mutex_unlock(&shmem_swaplist_mutex);
1286
1287        return error;
1288}
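/*
 * shmem_unuse() is called from try_to_unuse() in mm/swapfile.c while a swap
 * device is being disabled: shmem is handled first, before the remaining
 * anonymous users of that device are walked.
 */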
1289
1290/*
1291 * Move the page from the page cache to the swap cache.
1292 */
1293static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1294{
1295        struct shmem_inode_info *info;
1296        struct address_space *mapping;
1297        struct inode *inode;
1298        swp_entry_t swap;
1299        pgoff_t index;
1300
1301        VM_BUG_ON_PAGE(PageCompound(page), page);
1302        BUG_ON(!PageLocked(page));
1303        mapping = page->mapping;
1304        index = page->index;
1305        inode = mapping->host;
1306        info = SHMEM_I(inode);
1307        if (info->flags & VM_LOCKED)
1308                goto redirty;
1309        if (!total_swap_pages)
1310                goto redirty;
1311
1312        /*
1313         * Our capabilities prevent regular writeback or sync from ever calling
1314         * shmem_writepage; but a stacking filesystem might use ->writepage of
1315         * its underlying filesystem, in which case tmpfs should write out to
1316         * swap only in response to memory pressure, and not for the writeback
1317         * threads or sync.
1318         */
1319        if (!wbc->for_reclaim) {
1320                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
1321                goto redirty;
1322        }
1323
1324        /*
1325         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1326         * value into swapfile.c, the only way we can correctly account for a
1327         * fallocated page arriving here is now to initialize it and write it.
1328         *
1329         * That's okay for a page already fallocated earlier, but if we have
1330         * not yet completed the fallocation, then (a) we want to keep track
1331         * of this page in case we have to undo it, and (b) it may not be a
1332         * good idea to continue anyway, once we're pushing into swap.  So
1333         * reactivate the page, and let shmem_fallocate() quit when too many.
1334         */
1335        if (!PageUptodate(page)) {
1336                if (inode->i_private) {
1337                        struct shmem_falloc *shmem_falloc;
1338                        spin_lock(&inode->i_lock);
1339                        shmem_falloc = inode->i_private;
1340                        if (shmem_falloc &&
1341                            !shmem_falloc->waitq &&
1342                            index >= shmem_falloc->start &&
1343                            index < shmem_falloc->next)
1344                                shmem_falloc->nr_unswapped++;
1345                        else
1346                                shmem_falloc = NULL;
1347                        spin_unlock(&inode->i_lock);
1348                        if (shmem_falloc)
1349                                goto redirty;
1350                }
1351                clear_highpage(page);
1352                flush_dcache_page(page);
1353                SetPageUptodate(page);
1354        }
1355
1356        swap = get_swap_page(page);
1357        if (!swap.val)
1358                goto redirty;
1359
1360        /*
1361         * Add inode to shmem_unuse()'s list of swapped-out inodes,
1362         * if it's not already there.  Do it now before the page is
1363         * moved to swap cache, when its pagelock no longer protects
1364         * the inode from eviction.  But don't unlock the mutex until
1365         * we've incremented swapped, because shmem_unuse_inode() will
1366         * prune a !swapped inode from the swaplist under this mutex.
1367         */
1368        mutex_lock(&shmem_swaplist_mutex);
1369        if (list_empty(&info->swaplist))
1370                list_add(&info->swaplist, &shmem_swaplist);
1371
1372        if (add_to_swap_cache(page, swap,
1373                        __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
1374                spin_lock_irq(&info->lock);
1375                shmem_recalc_inode(inode);
1376                info->swapped++;
1377                spin_unlock_irq(&info->lock);
1378
1379                swap_shmem_alloc(swap);
1380                shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
1381
1382                mutex_unlock(&shmem_swaplist_mutex);
1383                BUG_ON(page_mapped(page));
1384                swap_writepage(page, wbc);
1385                return 0;
1386        }
1387
1388        mutex_unlock(&shmem_swaplist_mutex);
1389        put_swap_page(page, swap);
1390redirty:
1391        set_page_dirty(page);
1392        if (wbc->for_reclaim)
1393                return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
1394        unlock_page(page);
1395        return 0;
1396}
1397
1398#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1399static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1400{
1401        char buffer[64];
1402
1403        if (!mpol || mpol->mode == MPOL_DEFAULT)
1404                return;         /* show nothing */
1405
1406        mpol_to_str(buffer, sizeof(buffer), mpol);
1407
1408        seq_printf(seq, ",mpol=%s", buffer);
1409}
1410
1411static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1412{
1413        struct mempolicy *mpol = NULL;
1414        if (sbinfo->mpol) {
1415                spin_lock(&sbinfo->stat_lock);  /* prevent replace/use races */
1416                mpol = sbinfo->mpol;
1417                mpol_get(mpol);
1418                spin_unlock(&sbinfo->stat_lock);
1419        }
1420        return mpol;
1421}
1422#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1423static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1424{
1425}
1426static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1427{
1428        return NULL;
1429}
1430#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1431#ifndef CONFIG_NUMA
1432#define vm_policy vm_private_data
1433#endif
1434
1435static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1436                struct shmem_inode_info *info, pgoff_t index)
1437{
1438        /* Create a pseudo vma that just contains the policy */
1439        vma_init(vma, NULL);
1440        /* Bias interleave by inode number to distribute better across nodes */
1441        vma->vm_pgoff = index + info->vfs_inode.i_ino;
1442        vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1443}
1444
1445static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1446{
1447        /* Drop reference taken by mpol_shared_policy_lookup() */
1448        mpol_cond_put(vma->vm_policy);
1449}
1450
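    /*
     * Read the page for swap entry @swap back in for this inode, using a
     * pseudo vma so that readahead and page allocation honour the shared
     * mempolicy for @index.
     */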
1451static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1452                        struct shmem_inode_info *info, pgoff_t index)
1453{
1454        struct vm_area_struct pvma;
1455        struct page *page;
1456        struct vm_fault vmf;
1457
1458        shmem_pseudo_vma_init(&pvma, info, index);
1459        vmf.vma = &pvma;
1460        vmf.address = 0;
1461        page = swap_cluster_readahead(swap, gfp, &vmf);
1462        shmem_pseudo_vma_destroy(&pvma);
1463
1464        return page;
1465}
1466
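    /*
     * Try to allocate a transparent huge page for the huge-aligned range
     * covering @index: give up if THP pagecache is not configured, or if
     * any entry already exists within that range.
     */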
1467static struct page *shmem_alloc_hugepage(gfp_t gfp,
1468                struct shmem_inode_info *info, pgoff_t index)
1469{
1470        struct vm_area_struct pvma;
1471        struct address_space *mapping = info->vfs_inode.i_mapping;
1472        pgoff_t hindex;
1473        struct page *page;
1474
1475        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1476                return NULL;
1477
1478        hindex = round_down(index, HPAGE_PMD_NR);
1479        if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1480                                                                XA_PRESENT))
1481                return NULL;
1482
1483        shmem_pseudo_vma_init(&pvma, info, hindex);
1484        page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
1485                        HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
1486        shmem_pseudo_vma_destroy(&pvma);
1487        if (page)
1488                prep_transhuge_page(page);
1489        return page;
1490}
1491
1492static struct page *shmem_alloc_page(gfp_t gfp,
1493                        struct shmem_inode_info *info, pgoff_t index)
1494{
1495        struct vm_area_struct pvma;
1496        struct page *page;
1497
1498        shmem_pseudo_vma_init(&pvma, info, index);
1499        page = alloc_page_vma(gfp, &pvma, 0);
1500        shmem_pseudo_vma_destroy(&pvma);
1501
1502        return page;
1503}
1504
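    /*
     * Charge the allocation against the inode and superblock block limits,
     * then allocate a huge or small page.  Returns ERR_PTR(-ENOSPC) when
     * the tmpfs size limit would be exceeded, or ERR_PTR(-ENOMEM) when the
     * allocation itself fails (with the block accounting undone).
     */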
1505static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1506                struct inode *inode,
1507                pgoff_t index, bool huge)
1508{
1509        struct shmem_inode_info *info = SHMEM_I(inode);
1510        struct page *page;
1511        int nr;
1512        int err = -ENOSPC;
1513
1514        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1515                huge = false;
1516        nr = huge ? HPAGE_PMD_NR : 1;
1517
1518        if (!shmem_inode_acct_block(inode, nr))
1519                goto failed;
1520
1521        if (huge)
1522                page = shmem_alloc_hugepage(gfp, info, index);
1523        else
1524                page = shmem_alloc_page(gfp, info, index);
1525        if (page) {
1526                __SetPageLocked(page);
1527                __SetPageSwapBacked(page);
1528                return page;
1529        }
1530
1531        err = -ENOMEM;
1532        shmem_inode_unacct_blocks(inode, nr);
1533failed:
1534        return ERR_PTR(err);
1535}
1536
1537/*
1538 * When a page is moved from swapcache to shmem filecache (either by the
1539 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
1540 * shmem_unuse_inode()), it may have been read in earlier from swap, in
1541 * ignorance of the mapping it belongs to.  If that mapping has special
1542 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1543 * we may need to copy to a suitable page before moving to filecache.
1544 *
1545 * In a future release, this may well be extended to respect cpuset and
1546 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1547 * but for now it is a simple matter of zone.
1548 */
1549static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
1550{
1551        return page_zonenum(page) > gfp_zone(gfp);
1552}
1553
1554static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1555                                struct shmem_inode_info *info, pgoff_t index)
1556{
1557        struct page *oldpage, *newpage;
1558        struct address_space *swap_mapping;
1559        swp_entry_t entry;
1560        pgoff_t swap_index;
1561        int error;
1562
1563        oldpage = *pagep;
1564        entry.val = page_private(oldpage);
1565        swap_index = swp_offset(entry);
1566        swap_mapping = page_mapping(oldpage);
1567
1568        /*
1569         * We have arrived here because our zones are constrained, so don't
1570         * limit chance of success by further cpuset and node constraints.
1571         */
1572        gfp &= ~GFP_CONSTRAINT_MASK;
1573        newpage = shmem_alloc_page(gfp, info, index);
1574        if (!newpage)
1575                return -ENOMEM;
1576
1577        get_page(newpage);
1578        copy_highpage(newpage, oldpage);
1579        flush_dcache_page(newpage);
1580
1581        __SetPageLocked(newpage);
1582        __SetPageSwapBacked(newpage);
1583        SetPageUptodate(newpage);
1584        set_page_private(newpage, entry.val);
1585        SetPageSwapCache(newpage);
1586
1587        /*
1588         * Our caller will very soon move newpage out of swapcache, but it's
1589         * a nice clean interface for us to replace oldpage by newpage there.
1590         */
1591        xa_lock_irq(&swap_mapping->i_pages);
1592        error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
1593        if (!error) {
1594                __inc_node_page_state(newpage, NR_FILE_PAGES);
1595                __dec_node_page_state(oldpage, NR_FILE_PAGES);
1596        }
1597        xa_unlock_irq(&swap_mapping->i_pages);
1598
1599        if (unlikely(error)) {
1600                /*
1601                 * Is this possible?  I think not, now that our callers check
1602                 * both PageSwapCache and page_private after getting page lock;
1603                 * but be defensive.  Reverse old to newpage for clear and free.
1604                 */
1605                oldpage = newpage;
1606        } else {
1607                mem_cgroup_migrate(oldpage, newpage);
1608                lru_cache_add_anon(newpage);
1609                *pagep = newpage;
1610        }
1611
1612        ClearPageSwapCache(oldpage);
1613        set_page_private(oldpage, 0);
1614
1615        unlock_page(oldpage);
1616        put_page(oldpage);
1617        put_page(oldpage);
1618        return error;
1619}
1620
1621/*
1622 * Swap in the page pointed to by *pagep.
1623 * Caller has to make sure that *pagep contains a valid swapped page.
1624 * Returns 0 and the page in *pagep on success. On failure, returns
1625 * the error code and NULL in *pagep.
1626 */
1627static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1628                             struct page **pagep, enum sgp_type sgp,
1629                             gfp_t gfp, struct vm_area_struct *vma,
1630                             vm_fault_t *fault_type)
1631{
1632        struct address_space *mapping = inode->i_mapping;
1633        struct shmem_inode_info *info = SHMEM_I(inode);
1634        struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1635        struct mem_cgroup *memcg;
1636        struct page *page;
1637        swp_entry_t swap;
1638        int error;
1639
1640        VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1641        swap = radix_to_swp_entry(*pagep);
1642        *pagep = NULL;
1643
1644        /* Look it up and read it in.. */
1645        page = lookup_swap_cache(swap, NULL, 0);
1646        if (!page) {
1647                /* Or update major stats only when swapin succeeds?? */
1648                if (fault_type) {
1649                        *fault_type |= VM_FAULT_MAJOR;
1650                        count_vm_event(PGMAJFAULT);
1651                        count_memcg_event_mm(charge_mm, PGMAJFAULT);
1652                }
1653                /* Here we actually start the io */
1654                page = shmem_swapin(swap, gfp, info, index);
1655                if (!page) {
1656                        error = -ENOMEM;
1657                        goto failed;
1658                }
1659        }
1660
1661        /* We have to do this with page locked to prevent races */
1662        lock_page(page);
1663        if (!PageSwapCache(page) || page_private(page) != swap.val ||
1664            !shmem_confirm_swap(mapping, index, swap)) {
1665                error = -EEXIST;
1666                goto unlock;
1667        }
1668        if (!PageUptodate(page)) {
1669                error = -EIO;
1670                goto failed;
1671        }
1672        wait_on_page_writeback(page);
1673
1674        if (shmem_should_replace_page(page, gfp)) {
1675                error = shmem_replace_page(&page, gfp, info, index);
1676                if (error)
1677                        goto failed;
1678        }
1679
1680        error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1681                                            false);
1682        if (!error) {
1683                error = shmem_add_to_page_cache(page, mapping, index,
1684                                                swp_to_radix_entry(swap), gfp);
1685                /*
1686                 * We already confirmed swap under page lock, and make
1687                 * no memory allocation here, so usually no possibility
1688                 * of error; but free_swap_and_cache() only trylocks a
1689                 * page, so it is just possible that the entry has been
1690                 * truncated or holepunched since swap was confirmed.
1691                 * shmem_undo_range() will have done some of the
1692                 * unaccounting, now delete_from_swap_cache() will do
1693                 * the rest.
1694                 */
1695                if (error) {
1696                        mem_cgroup_cancel_charge(page, memcg, false);
1697                        delete_from_swap_cache(page);
1698                }
1699        }
1700        if (error)
1701                goto failed;
1702
1703        mem_cgroup_commit_charge(page, memcg, true, false);
1704
1705        spin_lock_irq(&info->lock);
1706        info->swapped--;
1707        shmem_recalc_inode(inode);
1708        spin_unlock_irq(&info->lock);
1709
1710        if (sgp == SGP_WRITE)
1711                mark_page_accessed(page);
1712
1713        delete_from_swap_cache(page);
1714        set_page_dirty(page);
1715        swap_free(swap);
1716
1717        *pagep = page;
1718        return 0;
1719failed:
1720        if (!shmem_confirm_swap(mapping, index, swap))
1721                error = -EEXIST;
1722unlock:
1723        if (page) {
1724                unlock_page(page);
1725                put_page(page);
1726        }
1727
1728        return error;
1729}
1730
1731/*
1732 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1733 *
1734 * If we allocate a new one we do not mark it dirty. That's up to the
1735 * vm. If we swap it in we mark it dirty, since we also free the swap
1736 * entry: a page cannot live in both the swap cache and the page cache.
1737 *
1738 * vmf and fault_type are only supplied by shmem_fault:
1739 * otherwise they are NULL.
1740 */
1741static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1742        struct page **pagep, enum sgp_type sgp, gfp_t gfp,
1743        struct vm_area_struct *vma, struct vm_fault *vmf,
1744                        vm_fault_t *fault_type)
1745{
1746        struct address_space *mapping = inode->i_mapping;
1747        struct shmem_inode_info *info = SHMEM_I(inode);
1748        struct shmem_sb_info *sbinfo;
1749        struct mm_struct *charge_mm;
1750        struct mem_cgroup *memcg;
1751        struct page *page;
1752        enum sgp_type sgp_huge = sgp;
1753        pgoff_t hindex = index;
1754        int error;
1755        int once = 0;
1756        int alloced = 0;
1757
1758        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1759                return -EFBIG;
1760        if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
1761                sgp = SGP_CACHE;
1762repeat:
1763        if (sgp <= SGP_CACHE &&
1764            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1765                return -EINVAL;
1766        }
1767
1768        sbinfo = SHMEM_SB(inode->i_sb);
1769        charge_mm = vma ? vma->vm_mm : current->mm;
1770
1771        page = find_lock_entry(mapping, index);
1772        if (xa_is_value(page)) {
1773                error = shmem_swapin_page(inode, index, &page,
1774                                          sgp, gfp, vma, fault_type);
1775                if (error == -EEXIST)
1776                        goto repeat;
1777
1778                *pagep = page;
1779                return error;
1780        }
1781
1782        if (page && sgp == SGP_WRITE)
1783                mark_page_accessed(page);
1784
1785        /* fallocated page? */
1786        if (page && !PageUptodate(page)) {
1787                if (sgp != SGP_READ)
1788                        goto clear;
1789                unlock_page(page);
1790                put_page(page);
1791                page = NULL;
1792        }
1793        if (page || sgp == SGP_READ) {
1794                *pagep = page;
1795                return 0;
1796        }
1797
1798        /*
1799         * Fast cache lookup did not find it, and any swap entry was
1800         * handled above: allocate a new page.
1801         */
1802
1803        if (vma && userfaultfd_missing(vma)) {
1804                *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1805                return 0;
1806        }
1807
1808        /* shmem_symlink() */
1809        if (mapping->a_ops != &shmem_aops)
1810                goto alloc_nohuge;
1811        if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1812                goto alloc_nohuge;
1813        if (shmem_huge == SHMEM_HUGE_FORCE)
1814                goto alloc_huge;
1815        switch (sbinfo->huge) {
1816                loff_t i_size;
1817                pgoff_t off;
1818        case SHMEM_HUGE_NEVER:
1819                goto alloc_nohuge;
1820        case SHMEM_HUGE_WITHIN_SIZE:
1821                off = round_up(index, HPAGE_PMD_NR);
1822                i_size = round_up(i_size_read(inode), PAGE_SIZE);
1823                if (i_size >= HPAGE_PMD_SIZE &&
1824                    i_size >> PAGE_SHIFT >= off)
1825                        goto alloc_huge;
1826                /* fallthrough */
1827        case SHMEM_HUGE_ADVISE:
1828                if (sgp_huge == SGP_HUGE)
1829                        goto alloc_huge;
1830                /* TODO: implement fadvise() hints */
1831                goto alloc_nohuge;
1832        }
1833
1834alloc_huge:
1835        page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1836        if (IS_ERR(page)) {
1837alloc_nohuge:
1838                page = shmem_alloc_and_acct_page(gfp, inode,
1839                                                 index, false);
1840        }
1841        if (IS_ERR(page)) {
1842                int retry = 5;
1843
1844                error = PTR_ERR(page);
1845                page = NULL;
1846                if (error != -ENOSPC)
1847                        goto unlock;
1848                /*
1849                 * Try to reclaim some space by splitting a huge page
1850                 * beyond i_size on the filesystem.
1851                 */
1852                while (retry--) {
1853                        int ret;
1854
1855                        ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1856                        if (ret == SHRINK_STOP)
1857                                break;
1858                        if (ret)
1859                                goto alloc_nohuge;
1860                }
1861                goto unlock;
1862        }
1863
1864        if (PageTransHuge(page))
1865                hindex = round_down(index, HPAGE_PMD_NR);
1866        else
1867                hindex = index;
1868
1869        if (sgp == SGP_WRITE)
1870                __SetPageReferenced(page);
1871
1872        error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1873                                            PageTransHuge(page));
1874        if (error)
1875                goto unacct;
1876        error = shmem_add_to_page_cache(page, mapping, hindex,
1877                                        NULL, gfp & GFP_RECLAIM_MASK);
1878        if (error) {
1879                mem_cgroup_cancel_charge(page, memcg,
1880                                         PageTransHuge(page));
1881                goto unacct;
1882        }
1883        mem_cgroup_commit_charge(page, memcg, false,
1884                                 PageTransHuge(page));
1885        lru_cache_add_anon(page);
1886
1887        spin_lock_irq(&info->lock);
1888        info->alloced += compound_nr(page);
1889        inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1890        shmem_recalc_inode(inode);
1891        spin_unlock_irq(&info->lock);
1892        alloced = true;
1893
1894        if (PageTransHuge(page) &&
1895            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1896                        hindex + HPAGE_PMD_NR - 1) {
1897                /*
1898                 * Part of the huge page is beyond i_size: subject
1899                 * to shrink under memory pressure.
1900                 */
1901                spin_lock(&sbinfo->shrinklist_lock);
1902                /*
1903                 * list_empty_careful(): defend against unlocked access to
1904                 * ->shrinklist in shmem_unused_huge_shrink()
1905                 */
1906                if (list_empty_careful(&info->shrinklist)) {
1907                        list_add_tail(&info->shrinklist,
1908                                      &sbinfo->shrinklist);
1909                        sbinfo->shrinklist_len++;
1910                }
1911                spin_unlock(&sbinfo->shrinklist_lock);
1912        }
1913
1914        /*
1915         * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1916         */
1917        if (sgp == SGP_FALLOC)
1918                sgp = SGP_WRITE;
1919clear:
1920        /*
1921         * Let SGP_WRITE caller clear ends if write does not fill page;
1922         * but SGP_FALLOC on a page fallocated earlier must initialize
1923         * it now, lest undo on failure cancel our earlier guarantee.
1924         */
1925        if (sgp != SGP_WRITE && !PageUptodate(page)) {
1926                struct page *head = compound_head(page);
1927                int i;
1928
1929                for (i = 0; i < compound_nr(head); i++) {
1930                        clear_highpage(head + i);
1931                        flush_dcache_page(head + i);
1932                }
1933                SetPageUptodate(head);
1934        }
1935
1936        /* Perhaps the file has been truncated since we checked */
1937        if (sgp <= SGP_CACHE &&
1938            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1939                if (alloced) {
1940                        ClearPageDirty(page);
1941                        delete_from_page_cache(page);
1942                        spin_lock_irq(&info->lock);
1943                        shmem_recalc_inode(inode);
1944                        spin_unlock_irq(&info->lock);
1945                }
1946                error = -EINVAL;
1947                goto unlock;
1948        }
1949        *pagep = page + index - hindex;
1950        return 0;
1951
1952        /*
1953         * Error recovery.
1954         */
1955unacct:
1956        shmem_inode_unacct_blocks(inode, compound_nr(page));
1957
1958        if (PageTransHuge(page)) {
1959                unlock_page(page);
1960                put_page(page);
1961                goto alloc_nohuge;
1962        }
1963unlock:
1964        if (page) {
1965                unlock_page(page);
1966                put_page(page);
1967        }
1968        if (error == -ENOSPC && !once++) {
1969                spin_lock_irq(&info->lock);
1970                shmem_recalc_inode(inode);
1971                spin_unlock_irq(&info->lock);
1972                goto repeat;
1973        }
1974        if (error == -EEXIST)
1975                goto repeat;
1976        return error;
1977}
1978
1979/*
1980 * This is like autoremove_wake_function, but it removes the wait queue
1981 * entry unconditionally - even if something else had already woken the
1982 * target.
1983 */
1984static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1985{
1986        int ret = default_wake_function(wait, mode, sync, key);
1987        list_del_init(&wait->entry);
1988        return ret;
1989}
1990
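    /*
     * Fault in a page of a mapped tmpfs file: on success the page is
     * returned locked (VM_FAULT_LOCKED).  A fault racing with hole-punching
     * may instead return VM_FAULT_NOPAGE or VM_FAULT_RETRY, as explained
     * below.
     */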
1991static vm_fault_t shmem_fault(struct vm_fault *vmf)
1992{
1993        struct vm_area_struct *vma = vmf->vma;
1994        struct inode *inode = file_inode(vma->vm_file);
1995        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
1996        enum sgp_type sgp;
1997        int err;
1998        vm_fault_t ret = VM_FAULT_LOCKED;
1999
2000        /*
2001         * Trinity finds that probing a hole which tmpfs is punching can
2002         * prevent the hole-punch from ever completing: which in turn
2003         * locks writers out with its hold on i_mutex.  So refrain from
2004         * faulting pages into the hole while it's being punched.  Although
2005         * shmem_undo_range() does remove the additions, it may be unable to
2006         * keep up, as each new page needs its own unmap_mapping_range() call,
2007         * and the i_mmap tree grows ever slower to scan if new vmas are added.
2008         *
2009         * It does not matter if we sometimes reach this check just before the
2010         * hole-punch begins, so that one fault then races with the punch:
2011         * we just need to make racing faults a rare case.
2012         *
2013         * The implementation below would be much simpler if we just used a
2014         * standard mutex or completion: but we cannot take i_mutex in fault,
2015         * and bloating every shmem inode for this unlikely case would be sad.
2016         */
2017        if (unlikely(inode->i_private)) {
2018                struct shmem_falloc *shmem_falloc;
2019
2020                spin_lock(&inode->i_lock);
2021                shmem_falloc = inode->i_private;
2022                if (shmem_falloc &&
2023                    shmem_falloc->waitq &&
2024                    vmf->pgoff >= shmem_falloc->start &&
2025                    vmf->pgoff < shmem_falloc->next) {
2026                        struct file *fpin;
2027                        wait_queue_head_t *shmem_falloc_waitq;
2028                        DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2029
2030                        ret = VM_FAULT_NOPAGE;
2031                        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2032                        if (fpin)
2033                                ret = VM_FAULT_RETRY;
2034
2035                        shmem_falloc_waitq = shmem_falloc->waitq;
2036                        prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2037                                        TASK_UNINTERRUPTIBLE);
2038                        spin_unlock(&inode->i_lock);
2039                        schedule();
2040
2041                        /*
2042                         * shmem_falloc_waitq points into the shmem_fallocate()
2043                         * stack of the hole-punching task: shmem_falloc_waitq
2044                         * is usually invalid by the time we reach here, but
2045                         * finish_wait() does not dereference it in that case;
2046                         * though i_lock needed lest racing with wake_up_all().
2047                         */
2048                        spin_lock(&inode->i_lock);
2049                        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2050                        spin_unlock(&inode->i_lock);
2051
2052                        if (fpin)
2053                                fput(fpin);
2054                        return ret;
2055                }
2056                spin_unlock(&inode->i_lock);
2057        }
2058
2059        sgp = SGP_CACHE;
2060
2061        if ((vma->vm_flags & VM_NOHUGEPAGE) ||
2062            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
2063                sgp = SGP_NOHUGE;
2064        else if (vma->vm_flags & VM_HUGEPAGE)
2065                sgp = SGP_HUGE;
2066
2067        err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
2068                                  gfp, vma, vmf, &ret);
2069        if (err)
2070                return vmf_error(err);
2071        return ret;
2072}
2073
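    /*
     * get_unmapped_area for tmpfs: when transparent huge pagecache may be
     * used, ask for HPAGE_PMD_SIZE - PAGE_SIZE of extra space and shift the
     * returned address so that the mapping can be PMD-aligned; otherwise
     * just return whatever the architecture's helper chose.
     */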
2074unsigned long shmem_get_unmapped_area(struct file *file,
2075                                      unsigned long uaddr, unsigned long len,
2076                                      unsigned long pgoff, unsigned long flags)
2077{
2078        unsigned long (*get_area)(struct file *,
2079                unsigned long, unsigned long, unsigned long, unsigned long);
2080        unsigned long addr;
2081        unsigned long offset;
2082        unsigned long inflated_len;
2083        unsigned long inflated_addr;
2084        unsigned long inflated_offset;
2085
2086        if (len > TASK_SIZE)
2087                return -ENOMEM;
2088
2089        get_area = current->mm->get_unmapped_area;
2090        addr = get_area(file, uaddr, len, pgoff, flags);
2091
2092        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2093                return addr;
2094        if (IS_ERR_VALUE(addr))
2095                return addr;
2096        if (addr & ~PAGE_MASK)
2097                return addr;
2098        if (addr > TASK_SIZE - len)
2099                return addr;
2100
2101        if (shmem_huge == SHMEM_HUGE_DENY)
2102                return addr;
2103        if (len < HPAGE_PMD_SIZE)
2104                return addr;
2105        if (flags & MAP_FIXED)
2106                return addr;
2107        /*
2108         * Our priority is to support MAP_SHARED mapped hugely;
2109         * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2110         * But if caller specified an address hint and we allocated area there
2111         * successfully, respect that as before.
2112         */
2113        if (uaddr == addr)
2114                return addr;
2115
2116        if (shmem_huge != SHMEM_HUGE_FORCE) {
2117                struct super_block *sb;
2118
2119                if (file) {
2120                        VM_BUG_ON(file->f_op != &shmem_file_operations);
2121                        sb = file_inode(file)->i_sb;
2122                } else {
2123                        /*
2124                         * Called directly from mm/mmap.c, or drivers/char/mem.c
2125                         * for "/dev/zero", to create a shared anonymous object.
2126                         */
2127                        if (IS_ERR(shm_mnt))
2128                                return addr;
2129                        sb = shm_mnt->mnt_sb;
2130                }
2131                if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2132                        return addr;
2133        }
2134
2135        offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2136        if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2137                return addr;
2138        if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2139                return addr;
2140
2141        inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2142        if (inflated_len > TASK_SIZE)
2143                return addr;
2144        if (inflated_len < len)
2145                return addr;
2146
2147        inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2148        if (IS_ERR_VALUE(inflated_addr))
2149                return addr;
2150        if (inflated_addr & ~PAGE_MASK)
2151                return addr;
2152
2153        inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2154        inflated_addr += offset - inflated_offset;
2155        if (inflated_offset > offset)
2156                inflated_addr += HPAGE_PMD_SIZE;
2157
2158        if (inflated_addr > TASK_SIZE - len)
2159                return addr;
2160        return inflated_addr;
2161}
2162
2163#ifdef CONFIG_NUMA
2164static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2165{
2166        struct inode *inode = file_inode(vma->vm_file);
2167        return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2168}
2169
2170static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2171                                          unsigned long addr)
2172{
2173        struct inode *inode = file_inode(vma->vm_file);
2174        pgoff_t index;
2175
2176        index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2177        return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2178}
2179#endif
2180
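    /*
     * Lock the inode's pages against swapping (or unlock them), typically
     * reached via shmctl(id, SHM_LOCK/SHM_UNLOCK, NULL) on SysV shared
     * memory; the locked size is charged against the caller's
     * RLIMIT_MEMLOCK through user_shm_lock().
     */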
2181int shmem_lock(struct file *file, int lock, struct user_struct *user)
2182{
2183        struct inode *inode = file_inode(file);
2184        struct shmem_inode_info *info = SHMEM_I(inode);
2185        int retval = -ENOMEM;
2186
2187        spin_lock_irq(&info->lock);
2188        if (lock && !(info->flags & VM_LOCKED)) {
2189                if (!user_shm_lock(inode->i_size, user))
2190                        goto out_nomem;
2191                info->flags |= VM_LOCKED;
2192                mapping_set_unevictable(file->f_mapping);
2193        }
2194        if (!lock && (info->flags & VM_LOCKED) && user) {
2195                user_shm_unlock(inode->i_size, user);
2196                info->flags &= ~VM_LOCKED;
2197                mapping_clear_unevictable(file->f_mapping);
2198        }
2199        retval = 0;
2200
2201out_nomem:
2202        spin_unlock_irq(&info->lock);
2203        return retval;
2204}
2205
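    /*
     * mmap of a tmpfs or memfd file.  A memfd sealed with, for example,
     * fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) refuses new writable
     * shared mappings here, and its read-only shared mappings lose
     * VM_MAYWRITE so that mprotect() cannot make them writable later.
     */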
2206static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2207{
2208        struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2209
2210        if (info->seals & F_SEAL_FUTURE_WRITE) {
2211                /*
2212                 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2213                 * "future write" seal active.
2214                 */
2215                if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2216                        return -EPERM;
2217
2218                /*
2219                 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
2220                 * MAP_SHARED and read-only, take care to not allow mprotect to
2221                 * revert protections on such mappings. Do this only for shared
2222                 * mappings. For private mappings, we don't need to mask
2223                 * VM_MAYWRITE, as we still want them to be COW-writable.
2224                 */
2225                if (vma->vm_flags & VM_SHARED)
2226                        vma->vm_flags &= ~(VM_MAYWRITE);
2227        }
2228
2229        file_accessed(file);
2230        vma->vm_ops = &shmem_vm_ops;
2231        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2232                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2233                        (vma->vm_end & HPAGE_PMD_MASK)) {
2234                khugepaged_enter(vma, vma->vm_flags);
2235        }
2236        return 0;
2237}
2238
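    /*
     * Allocate and initialize a new shmem inode of the given type (regular
     * file, directory, symlink or special), or return NULL on failure,
     * e.g. when the superblock's inode limit has been reached.
     */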
2239static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
2240                                     umode_t mode, dev_t dev, unsigned long flags)
2241{
2242        struct inode *inode;
2243        struct shmem_inode_info *info;
2244        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2245
2246        if (shmem_reserve_inode(sb))
2247                return NULL;
2248
2249        inode = new_inode(sb);
2250        if (inode) {
2251                inode->i_ino = get_next_ino();
2252                inode_init_owner(inode, dir, mode);
2253                inode->i_blocks = 0;
2254                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2255                inode->i_generation = prandom_u32();
2256                info = SHMEM_I(inode);
2257                memset(info, 0, (char *)inode - (char *)info);
2258                spin_lock_init(&info->lock);
2259                atomic_set(&info->stop_eviction, 0);
2260                info->seals = F_SEAL_SEAL;
2261                info->flags = flags & VM_NORESERVE;
2262                INIT_LIST_HEAD(&info->shrinklist);
2263                INIT_LIST_HEAD(&info->swaplist);
2264                simple_xattrs_init(&info->xattrs);
2265                cache_no_acl(inode);
2266
2267                switch (mode & S_IFMT) {
2268                default:
2269                        inode->i_op = &shmem_special_inode_operations;
2270                        init_special_inode(inode, mode, dev);
2271                        break;
2272                case S_IFREG:
2273                        inode->i_mapping->a_ops = &shmem_aops;
2274                        inode->i_op = &shmem_inode_operations;
2275                        inode->i_fop = &shmem_file_operations;
2276                        mpol_shared_policy_init(&info->policy,
2277                                                 shmem_get_sbmpol(sbinfo));
2278                        break;
2279                case S_IFDIR:
2280                        inc_nlink(inode);
2281                        /* Some things misbehave if size == 0 on a directory */
2282                        inode->i_size = 2 * BOGO_DIRENT_SIZE;
2283                        inode->i_op = &shmem_dir_inode_operations;
2284                        inode->i_fop = &simple_dir_operations;
2285                        break;
2286                case S_IFLNK:
2287                        /*
2288                         * Must not load anything in the rbtree,
2289                         * mpol_free_shared_policy will not be called.
2290                         */
2291                        mpol_shared_policy_init(&info->policy, NULL);
2292                        break;
2293                }
2294
2295                lockdep_annotate_inode_mutex_key(inode);
2296        } else
2297                shmem_free_inode(sb);
2298        return inode;
2299}
2300
2301bool shmem_mapping(struct address_space *mapping)
2302{
2303        return mapping->a_ops == &shmem_aops;
2304}
2305
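    /*
     * Worker for userfaultfd's UFFDIO_COPY and UFFDIO_ZEROPAGE on a
     * shmem-backed VMA: allocate a page, fill it from @src_addr (or zero
     * it when @zeropage), add it to the file's page cache and map it at
     * @dst_addr.  Returns -ENOENT when copy_from_user() must be retried by
     * the caller without mmap_sem held, passing the partly set up page
     * back in *pagep.
     */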
2306static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2307                                  pmd_t *dst_pmd,
2308                                  struct vm_area_struct *dst_vma,
2309                                  unsigned long dst_addr,
2310                                  unsigned long src_addr,
2311                                  bool zeropage,
2312                                  struct page **pagep)
2313{
2314        struct inode *inode = file_inode(dst_vma->vm_file);
2315        struct shmem_inode_info *info = SHMEM_I(inode);
2316        struct address_space *mapping = inode->i_mapping;
2317        gfp_t gfp = mapping_gfp_mask(mapping);
2318        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2319        struct mem_cgroup *memcg;
2320        spinlock_t *ptl;
2321        void *page_kaddr;
2322        struct page *page;
2323        pte_t _dst_pte, *dst_pte;
2324        int ret;
2325        pgoff_t offset, max_off;
2326
2327        ret = -ENOMEM;
2328        if (!shmem_inode_acct_block(inode, 1))
2329                goto out;
2330
2331        if (!*pagep) {
2332                page = shmem_alloc_page(gfp, info, pgoff);
2333                if (!page)
2334                        goto out_unacct_blocks;
2335
2336                if (!zeropage) {        /* mcopy_atomic */
2337                        page_kaddr = kmap_atomic(page);
2338                        ret = copy_from_user(page_kaddr,
2339                                             (const void __user *)src_addr,
2340                                             PAGE_SIZE);
2341                        kunmap_atomic(page_kaddr);
2342
2343                        /* fallback to copy_from_user outside mmap_sem */
2344                        if (unlikely(ret)) {
2345                                *pagep = page;
2346                                shmem_inode_unacct_blocks(inode, 1);
2347                                /* don't free the page */
2348                                return -ENOENT;
2349                        }
2350                } else {                /* mfill_zeropage_atomic */
2351                        clear_highpage(page);
2352                }
2353        } else {
2354                page = *pagep;
2355                *pagep = NULL;
2356        }
2357
2358        VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2359        __SetPageLocked(page);
2360        __SetPageSwapBacked(page);
2361        __SetPageUptodate(page);
2362
2363        ret = -EFAULT;
2364        offset = linear_page_index(dst_vma, dst_addr);
2365        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2366        if (unlikely(offset >= max_off))
2367                goto out_release;
2368
2369        ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2370        if (ret)
2371                goto out_release;
2372
2373        ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2374                                                gfp & GFP_RECLAIM_MASK);
2375        if (ret)
2376                goto out_release_uncharge;
2377
2378        mem_cgroup_commit_charge(page, memcg, false, false);
2379
2380        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2381        if (dst_vma->vm_flags & VM_WRITE)
2382                _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2383        else {
2384                /*
2385                 * We don't set the pte dirty if the vma has no
2386                 * VM_WRITE permission, so mark the page dirty or it
2387                 * could be freed from under us. We could do it
2388                 * unconditionally before unlock_page(), but doing it
2389                 * only if VM_WRITE is not set is faster.
2390                 */
2391                set_page_dirty(page);
2392        }
2393
2394        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2395
2396        ret = -EFAULT;
2397        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2398        if (unlikely(offset >= max_off))
2399                goto out_release_uncharge_unlock;
2400
2401        ret = -EEXIST;
2402        if (!pte_none(*dst_pte))
2403                goto out_release_uncharge_unlock;
2404
2405        lru_cache_add_anon(page);
2406
2407        spin_lock(&info->lock);
2408        info->alloced++;
2409        inode->i_blocks += BLOCKS_PER_PAGE;
2410        shmem_recalc_inode(inode);
2411        spin_unlock(&info->lock);
2412
2413        inc_mm_counter(dst_mm, mm_counter_file(page));
2414        page_add_file_rmap(page, false);
2415        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2416
2417        /* No need to invalidate - it was non-present before */
2418        update_mmu_cache(dst_vma, dst_addr, dst_pte);
2419        pte_unmap_unlock(dst_pte, ptl);
2420        unlock_page(page);
2421        ret = 0;
2422out:
2423        return ret;
2424out_release_uncharge_unlock:
2425        pte_unmap_unlock(dst_pte, ptl);
2426        ClearPageDirty(page);
2427        delete_from_page_cache(page);
2428out_release_uncharge:
2429        mem_cgroup_cancel_charge(page, memcg, false);
2430out_release:
2431        unlock_page(page);
2432        put_page(page);
2433out_unacct_blocks:
2434        shmem_inode_unacct_blocks(inode, 1);
2435        goto out;
2436}
2437
2438int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2439                           pmd_t *dst_pmd,
2440                           struct vm_area_struct *dst_vma,
2441                           unsigned long dst_addr,
2442                           unsigned long src_addr,
2443                           struct page **pagep)
2444{
2445        return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2446                                      dst_addr, src_addr, false, pagep);
2447}
2448
2449int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2450                             pmd_t *dst_pmd,
2451                             struct vm_area_struct *dst_vma,
2452                             unsigned long dst_addr)
2453{
2454        struct page *page = NULL;
2455
2456        return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2457                                      dst_addr, 0, true, &page);
2458}
2459
2460#ifdef CONFIG_TMPFS
2461static const struct inode_operations shmem_symlink_inode_operations;
2462static const struct inode_operations shmem_short_symlink_operations;
2463
2464#ifdef CONFIG_TMPFS_XATTR
2465static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2466#else
2467#define shmem_initxattrs NULL
2468#endif
2469
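    /*
     * write_begin for tmpfs: refuse the write if F_SEAL_WRITE or
     * F_SEAL_FUTURE_WRITE is set, or if F_SEAL_GROW is set and the write
     * would extend the file; otherwise look up or allocate the page to be
     * written via shmem_getpage(SGP_WRITE).
     */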
2470static int
2471shmem_write_begin(struct file *file, struct address_space *mapping,
2472                        loff_t pos, unsigned len, unsigned flags,
2473                        struct page **pagep, void **fsdata)
2474{
2475        struct inode *inode = mapping->host;
2476        struct shmem_inode_info *info = SHMEM_I(inode);
2477        pgoff_t index = pos >> PAGE_SHIFT;
2478
2479        /* i_mutex is held by caller */
2480        if (unlikely(info->seals & (F_SEAL_GROW |
2481                                   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2482                if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2483                        return -EPERM;
2484                if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2485                        return -EPERM;
2486        }
2487
2488        return shmem_getpage(inode, index, pagep, SGP_WRITE);
2489}
2490
2491static int
2492shmem_write_end(struct file *file, struct address_space *mapping,
2493                        loff_t pos, unsigned len, unsigned copied,
2494                        struct page *page, void *fsdata)
2495{
2496        struct inode *inode = mapping->host;
2497
2498        if (pos + copied > inode->i_size)
2499                i_size_write(inode, pos + copied);
2500
2501        if (!PageUptodate(page)) {
2502                struct page *head = compound_head(page);
2503                if (PageTransCompound(page)) {
2504                        int i;
2505
2506                        for (i = 0; i < HPAGE_PMD_NR; i++) {
2507                                if (head + i == page)
2508                                        continue;
2509                                clear_highpage(head + i);
2510                                flush_dcache_page(head + i);
2511                        }
2512                }
2513                if (copied < PAGE_SIZE) {
2514                        unsigned from = pos & (PAGE_SIZE - 1);
2515                        zero_user_segments(page, 0, from,
2516                                        from + copied, PAGE_SIZE);
2517                }
2518                SetPageUptodate(head);
2519        }
2520        set_page_dirty(page);
2521        unlock_page(page);
2522        put_page(page);
2523
2524        return copied;
2525}
2526
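    /*
     * read_iter for tmpfs: copy to the iterator straight from the page
     * cache; holes are normally read as zeros via ZERO_PAGE rather than
     * being allocated, except for the stacking-filesystem case noted
     * below.
     */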
2527static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2528{
2529        struct file *file = iocb->ki_filp;
2530        struct inode *inode = file_inode(file);
2531        struct address_space *mapping = inode->i_mapping;
2532        pgoff_t index;
2533        unsigned long offset;
2534        enum sgp_type sgp = SGP_READ;
2535        int error = 0;
2536        ssize_t retval = 0;
2537        loff_t *ppos = &iocb->ki_pos;
2538
2539        /*
2540         * Might this read be for a stacking filesystem?  Then when reading
2541         * holes of a sparse file, we actually need to allocate those pages,
2542         * and even mark them dirty, so it cannot exceed the max_blocks limit.
2543         */
2544        if (!iter_is_iovec(to))
2545                sgp = SGP_CACHE;
2546
2547        index = *ppos >> PAGE_SHIFT;
2548        offset = *ppos & ~PAGE_MASK;
2549
2550        for (;;) {
2551                struct page *page = NULL;
2552                pgoff_t end_index;
2553                unsigned long nr, ret;
2554                loff_t i_size = i_size_read(inode);
2555
2556                end_index = i_size >> PAGE_SHIFT;
2557                if (index > end_index)
2558                        break;
2559                if (index == end_index) {
2560                        nr = i_size & ~PAGE_MASK;
2561                        if (nr <= offset)
2562                                break;
2563                }
2564
2565                error = shmem_getpage(inode, index, &page, sgp);
2566                if (error) {
2567                        if (error == -EINVAL)
2568                                error = 0;
2569                        break;
2570                }
2571                if (page) {
2572                        if (sgp == SGP_CACHE)
2573                                set_page_dirty(page);
2574                        unlock_page(page);
2575                }
2576
2577                /*
2578                 * We must re-evaluate i_size after getting the page, since reads
2579                 * (unlike writes) are called without i_mutex protection against truncate
2580                 */
2581                nr = PAGE_SIZE;
2582                i_size = i_size_read(inode);
2583                end_index = i_size >> PAGE_SHIFT;
2584                if (index == end_index) {
2585                        nr = i_size & ~PAGE_MASK;
2586                        if (nr <= offset) {
2587                                if (page)
2588                                        put_page(page);
2589                                break;
2590                        }
2591                }
2592                nr -= offset;
2593
2594                if (page) {
2595                        /*
2596                         * If users can be writing to this page using arbitrary
2597                         * virtual addresses, take care about potential aliasing
2598                         * before reading the page on the kernel side.
2599                         */
2600                        if (mapping_writably_mapped(mapping))
2601                                flush_dcache_page(page);
2602                        /*
2603                         * Mark the page accessed if we read the beginning.
2604                         */
2605                        if (!offset)
2606                                mark_page_accessed(page);
2607                } else {
2608                        page = ZERO_PAGE(0);
2609                        get_page(page);
2610                }
2611
2612                /*
2613                 * Ok, we have the page, and it's up-to-date, so
2614                 * now we can copy it to user space...
2615                 */
2616                ret = copy_page_to_iter(page, offset, nr, to);
2617                retval += ret;
2618                offset += ret;
2619                index += offset >> PAGE_SHIFT;
2620                offset &= ~PAGE_MASK;
2621
2622                put_page(page);
2623                if (!iov_iter_count(to))
2624                        break;
2625                if (ret < nr) {
2626                        error = -EFAULT;
2627                        break;
2628                }
2629                cond_resched();
2630        }
2631
2632        *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
2633        file_accessed(file);
2634        return retval ? retval : error;
2635}
2636
2637/*
2638 * llseek SEEK_DATA or SEEK_HOLE through the page cache.
2639 */
2640static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2641                                    pgoff_t index, pgoff_t end, int whence)
2642{
2643        struct page *page;
2644        struct pagevec pvec;
2645        pgoff_t indices[PAGEVEC_SIZE];
2646        bool done = false;
2647        int i;
2648
2649        pagevec_init(&pvec);
2650        pvec.nr = 1;            /* start small: we may be there already */
2651        while (!done) {
2652                pvec.nr = find_get_entries(mapping, index,
2653                                        pvec.nr, pvec.pages, indices);
2654                if (!pvec.nr) {
2655                        if (whence == SEEK_DATA)
2656                                index = end;
2657                        break;
2658                }
2659                for (i = 0; i < pvec.nr; i++, index++) {
2660                        if (index < indices[i]) {
2661                                if (whence == SEEK_HOLE) {
2662                                        done = true;
2663                                        break;
2664                                }
2665                                index = indices[i];
2666                        }
2667                        page = pvec.pages[i];
2668                        if (page && !xa_is_value(page)) {
2669                                if (!PageUptodate(page))
2670                                        page = NULL;
2671                        }
2672                        if (index >= end ||
2673                            (page && whence == SEEK_DATA) ||
2674                            (!page && whence == SEEK_HOLE)) {
2675                                done = true;
2676                                break;
2677                        }
2678                }
2679                pagevec_remove_exceptionals(&pvec);
2680                pagevec_release(&pvec);
2681                pvec.nr = PAGEVEC_SIZE;
2682                cond_resched();
2683        }
2684        return index;
2685}
2686
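    /*
     * llseek for tmpfs: SEEK_DATA and SEEK_HOLE, e.g.
     * lseek(fd, 0, SEEK_HOLE), are answered from the page cache by the
     * helper above; other whence values are handled by
     * generic_file_llseek_size().
     */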
2687static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2688{
2689        struct address_space *mapping = file->f_mapping;
2690        struct inode *inode = mapping->host;
2691        pgoff_t start, end;
2692        loff_t new_offset;
2693
2694        if (whence != SEEK_DATA && whence != SEEK_HOLE)
2695                return generic_file_llseek_size(file, offset, whence,
2696                                        MAX_LFS_FILESIZE, i_size_read(inode));
2697        inode_lock(inode);
2698        /* We're holding i_mutex so we can access i_size directly */
2699
2700        if (offset < 0 || offset >= inode->i_size)
2701                offset = -ENXIO;
2702        else {
2703                start = offset >> PAGE_SHIFT;
2704                end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2705                new_offset = shmem_seek_hole_data(mapping, start, end, whence);
2706                new_offset <<= PAGE_SHIFT;
2707                if (new_offset > offset) {
2708                        if (new_offset < inode->i_size)
2709                                offset = new_offset;
2710                        else if (whence == SEEK_DATA)
2711                                offset = -ENXIO;
2712                        else
2713                                offset = inode->i_size;
2714                }
2715        }
2716
2717        if (offset >= 0)
2718                offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2719        inode_unlock(inode);
2720        return offset;
2721}
2722
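    /*
     * fallocate for tmpfs: supports preallocation (with or without
     * FALLOC_FL_KEEP_SIZE) and hole punching, e.g.
     * fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len).
     */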
2723static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2724                                                         loff_t len)
2725{
2726        struct inode *inode = file_inode(file);
2727        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2728        struct shmem_inode_info *info = SHMEM_I(inode);
2729        struct shmem_falloc shmem_falloc;
2730        pgoff_t start, index, end;
2731        int error;
2732
2733        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2734                return -EOPNOTSUPP;
2735
2736        inode_lock(inode);
2737
2738        if (mode & FALLOC_FL_PUNCH_HOLE) {
2739                struct address_space *mapping = file->f_mapping;
2740                loff_t unmap_start = round_up(offset, PAGE_SIZE);
2741                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2742                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2743
2744                /* protected by i_mutex */
2745                if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2746                        error = -EPERM;
2747                        goto out;
2748                }
2749
2750                shmem_falloc.waitq = &shmem_falloc_waitq;
2751                shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
2752                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2753                spin_lock(&inode->i_lock);
2754                inode->i_private = &shmem_falloc;
2755                spin_unlock(&inode->i_lock);
2756
2757                if ((u64)unmap_end > (u64)unmap_start)
2758                        unmap_mapping_range(mapping, unmap_start,
2759                                            1 + unmap_end - unmap_start, 0);
2760                shmem_truncate_range(inode, offset, offset + len - 1);
2761                /* No need to unmap again: hole-punching leaves COWed pages */
2762
2763                spin_lock(&inode->i_lock);
2764                inode->i_private = NULL;
2765                wake_up_all(&shmem_falloc_waitq);
2766                WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
2767                spin_unlock(&inode->i_lock);
2768                error = 0;
2769                goto out;
2770        }
2771
2772        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2773        error = inode_newsize_ok(inode, offset + len);
2774        if (error)
2775                goto out;
2776
2777        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2778                error = -EPERM;
2779                goto out;
2780        }
2781
2782        start = offset >> PAGE_SHIFT;
2783        end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2784        /* Try to avoid a swapstorm if len is impossible to satisfy */
2785        if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2786                error = -ENOSPC;
2787                goto out;
2788        }
2789
2790        shmem_falloc.waitq = NULL;
2791        shmem_falloc.start = start;
2792        shmem_falloc.next  = start;
2793        shmem_falloc.nr_falloced = 0;
2794        shmem_falloc.nr_unswapped = 0;
2795        spin_lock(&inode->i_lock);
2796        inode->i_private = &shmem_falloc;
2797        spin_unlock(&inode->i_lock);
2798
2799        for (index = start; index < end; index++) {
2800                struct page *page;
2801
2802                /*
2803                 * Good, the fallocate(2) manpage permits EINTR: we may have
2804                 * been interrupted because we are using up too much memory.
2805                 */
2806                if (signal_pending(current))
2807                        error = -EINTR;
2808                else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2809                        error = -ENOMEM;
2810                else
2811                        error = shmem_getpage(inode, index, &page, SGP_FALLOC);
2812                if (error) {
2813                        /* Remove the !PageUptodate pages we added */
2814                        if (index > start) {
2815                                shmem_undo_range(inode,
2816                                    (loff_t)start << PAGE_SHIFT,
2817                                    ((loff_t)index << PAGE_SHIFT) - 1, true);
2818                        }
2819                        goto undone;
2820                }
2821
2822                /*
2823                 * Inform shmem_writepage() how far we have reached.
2824                 * No need for lock or barrier: we have the page lock.
2825                 */
2826                shmem_falloc.next++;
2827                if (!PageUptodate(page))
2828                        shmem_falloc.nr_falloced++;
2829
2830                /*
2831                 * If !PageUptodate, leave it that way so that freeable pages
2832                 * can be recognized if we need to rollback on error later.
2833                 * But set_page_dirty so that memory pressure will swap rather
2834                 * than free the pages we are allocating (and SGP_CACHE pages
2835                 * might still be clean: we now need to mark those dirty too).
2836                 */
2837                set_page_dirty(page);
2838                unlock_page(page);
2839                put_page(page);
2840                cond_resched();
2841        }
2842
2843        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2844                i_size_write(inode, offset + len);
2845        inode->i_ctime = current_time(inode);
2846undone:
2847        spin_lock(&inode->i_lock);
2848        inode->i_private = NULL;
2849        spin_unlock(&inode->i_lock);
2850out:
2851        inode_unlock(inode);
2852        return error;
2853}
2854
2855static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2856{
2857        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2858
2859        buf->f_type = TMPFS_MAGIC;
2860        buf->f_bsize = PAGE_SIZE;
2861        buf->f_namelen = NAME_MAX;
2862        if (sbinfo->max_blocks) {
2863                buf->f_blocks = sbinfo->max_blocks;
2864                buf->f_bavail =
2865                buf->f_bfree  = sbinfo->max_blocks -
2866                                percpu_counter_sum(&sbinfo->used_blocks);
2867        }
2868        if (sbinfo->max_inodes) {
2869                buf->f_files = sbinfo->max_inodes;
2870                buf->f_ffree = sbinfo->free_inodes;
2871        }
2872        /* else leave those fields 0 like simple_statfs */
2873        return 0;
2874}
2875
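/*
 * Illustrative userspace sketch (not part of this file's build): reading
 * back the fields shmem_statfs() fills in.  The helper name tmpfs_usage()
 * is made up for this example.
 */
#if 0
#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>	/* TMPFS_MAGIC */

static int tmpfs_usage(const char *path)
{
	struct statfs st;

	if (statfs(path, &st) < 0 || st.f_type != TMPFS_MAGIC)
		return -1;
	/* f_blocks == 0 means the instance was mounted without a size limit */
	printf("%s: %llu of %llu blocks free, block size %ld\n", path,
	       (unsigned long long)st.f_bfree,
	       (unsigned long long)st.f_blocks,
	       (long)st.f_bsize);
	return 0;
}
#endif
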
2876/*
2877 * File creation. Allocate an inode, and we're done.
2878 */
2879static int
2880shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
2881{
2882        struct inode *inode;
2883        int error = -ENOSPC;
2884
2885        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2886        if (inode) {
2887                error = simple_acl_create(dir, inode);
2888                if (error)
2889                        goto out_iput;
2890                error = security_inode_init_security(inode, dir,
2891                                                     &dentry->d_name,
2892                                                     shmem_initxattrs, NULL);
2893                if (error && error != -EOPNOTSUPP)
2894                        goto out_iput;
2895
2896                error = 0;
2897                dir->i_size += BOGO_DIRENT_SIZE;
2898                dir->i_ctime = dir->i_mtime = current_time(dir);
2899                d_instantiate(dentry, inode);
2900                dget(dentry); /* Extra count - pin the dentry in core */
2901        }
2902        return error;
2903out_iput:
2904        iput(inode);
2905        return error;
2906}
2907
2908static int
2909shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2910{
2911        struct inode *inode;
2912        int error = -ENOSPC;
2913
2914        inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2915        if (inode) {
2916                error = security_inode_init_security(inode, dir,
2917                                                     NULL,
2918                                                     shmem_initxattrs, NULL);
2919                if (error && error != -EOPNOTSUPP)
2920                        goto out_iput;
2921                error = simple_acl_create(dir, inode);
2922                if (error)
2923                        goto out_iput;
2924                d_tmpfile(dentry, inode);
2925        }
2926        return error;
2927out_iput:
2928        iput(inode);
2929        return error;
2930}
2931
2932static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2933{
2934        int error;
2935
2936        if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
2937                return error;
2938        inc_nlink(dir);
2939        return 0;
2940}
2941
2942static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2943                bool excl)
2944{
2945        return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
2946}
2947
2948/*
2949 * Link a file.
2950 */
2951static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2952{
2953        struct inode *inode = d_inode(old_dentry);
2954        int ret = 0;
2955
2956        /*
2957         * No ordinary (disk based) filesystem counts links as inodes;
2958         * but each new link needs a new dentry, pinning lowmem, and
2959         * tmpfs dentries cannot be pruned until they are unlinked.
2960         * But if an O_TMPFILE file is linked into the tmpfs, the
2961         * first link must skip that, to get the accounting right.
2962         */
2963        if (inode->i_nlink) {
2964                ret = shmem_reserve_inode(inode->i_sb);
2965                if (ret)
2966                        goto out;
2967        }
2968
2969        dir->i_size += BOGO_DIRENT_SIZE;
2970        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2971        inc_nlink(inode);
2972        ihold(inode);   /* New dentry reference */
2973        dget(dentry);           /* Extra pinning count for the created dentry */
2974        d_instantiate(dentry, inode);
2975out:
2976        return ret;
2977}
2978
2979static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2980{
2981        struct inode *inode = d_inode(dentry);
2982
2983        if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2984                shmem_free_inode(inode->i_sb);
2985
2986        dir->i_size -= BOGO_DIRENT_SIZE;
2987        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2988        drop_nlink(inode);
2989        dput(dentry);   /* Undo the count from "create" - this does all the work */
2990        return 0;
2991}
2992
2993static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2994{
2995        if (!simple_empty(dentry))
2996                return -ENOTEMPTY;
2997
2998        drop_nlink(d_inode(dentry));
2999        drop_nlink(dir);
3000        return shmem_unlink(dir, dentry);
3001}
3002
3003static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
3004{
3005        bool old_is_dir = d_is_dir(old_dentry);
3006        bool new_is_dir = d_is_dir(new_dentry);
3007
3008        if (old_dir != new_dir && old_is_dir != new_is_dir) {
3009                if (old_is_dir) {
3010                        drop_nlink(old_dir);
3011                        inc_nlink(new_dir);
3012                } else {
3013                        drop_nlink(new_dir);
3014                        inc_nlink(old_dir);
3015                }
3016        }
3017        old_dir->i_ctime = old_dir->i_mtime =
3018        new_dir->i_ctime = new_dir->i_mtime =
3019        d_inode(old_dentry)->i_ctime =
3020        d_inode(new_dentry)->i_ctime = current_time(old_dir);
3021
3022        return 0;
3023}
3024
3025static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
3026{
3027        struct dentry *whiteout;
3028        int error;
3029
3030        whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3031        if (!whiteout)
3032                return -ENOMEM;
3033
3034        error = shmem_mknod(old_dir, whiteout,
3035                            S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3036        dput(whiteout);
3037        if (error)
3038                return error;
3039
3040        /*
3041         * Cheat and hash the whiteout while the old dentry is still in
3042         * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3043         *
3044 * d_lookup() will consistently find one of them at this point;
3045 * which one it finds does not matter.
3046         */
3047        d_rehash(whiteout);
3048        return 0;
3049}
3050
3051/*
3052 * The VFS layer already does all the dentry work for rename;
3053 * we just have to decrement the usage count for the target if
3054 * it exists, so that the VFS layer correctly frees it when it
3055 * gets overwritten.
3056 */
3057static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
3058{
3059        struct inode *inode = d_inode(old_dentry);
3060        int they_are_dirs = S_ISDIR(inode->i_mode);
3061
3062        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3063                return -EINVAL;
3064
3065        if (flags & RENAME_EXCHANGE)
3066                return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
3067
3068        if (!simple_empty(new_dentry))
3069                return -ENOTEMPTY;
3070
3071        if (flags & RENAME_WHITEOUT) {
3072                int error;
3073
3074                error = shmem_whiteout(old_dir, old_dentry);
3075                if (error)
3076                        return error;
3077        }
3078
3079        if (d_really_is_positive(new_dentry)) {
3080                (void) shmem_unlink(new_dir, new_dentry);
3081                if (they_are_dirs) {
3082                        drop_nlink(d_inode(new_dentry));
3083                        drop_nlink(old_dir);
3084                }
3085        } else if (they_are_dirs) {
3086                drop_nlink(old_dir);
3087                inc_nlink(new_dir);
3088        }
3089
3090        old_dir->i_size -= BOGO_DIRENT_SIZE;
3091        new_dir->i_size += BOGO_DIRENT_SIZE;
3092        old_dir->i_ctime = old_dir->i_mtime =
3093        new_dir->i_ctime = new_dir->i_mtime =
3094        inode->i_ctime = current_time(old_dir);
3095        return 0;
3096}
3097
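/*
 * Illustrative userspace sketch (not part of this file's build): the
 * RENAME_EXCHANGE flag handled by shmem_exchange() above, via renameat2()
 * (glibc 2.28+).  The helper name swap_paths() is made up for this example.
 */
#if 0
#define _GNU_SOURCE
#include <stdio.h>	/* renameat2(), RENAME_EXCHANGE */
#include <fcntl.h>	/* AT_FDCWD */

static int swap_paths(const char *a, const char *b)
{
	/* atomically exchange the two names; both must already exist */
	return renameat2(AT_FDCWD, a, AT_FDCWD, b, RENAME_EXCHANGE);
}
#endif
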
3098static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
3099{
3100        int error;
3101        int len;
3102        struct inode *inode;
3103        struct page *page;
3104
3105        len = strlen(symname) + 1;
3106        if (len > PAGE_SIZE)
3107                return -ENAMETOOLONG;
3108
3109        inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
3110                                VM_NORESERVE);
3111        if (!inode)
3112                return -ENOSPC;
3113
3114        error = security_inode_init_security(inode, dir, &dentry->d_name,
3115                                             shmem_initxattrs, NULL);
3116        if (error) {
3117                if (error != -EOPNOTSUPP) {
3118                        iput(inode);
3119                        return error;
3120                }
3121                error = 0;
3122        }
3123
3124        inode->i_size = len-1;
3125        if (len <= SHORT_SYMLINK_LEN) {
3126                inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3127                if (!inode->i_link) {
3128                        iput(inode);
3129                        return -ENOMEM;
3130                }
3131                inode->i_op = &shmem_short_symlink_operations;
3132        } else {
3133                inode_nohighmem(inode);
3134                error = shmem_getpage(inode, 0, &page, SGP_WRITE);
3135                if (error) {
3136                        iput(inode);
3137                        return error;
3138                }
3139                inode->i_mapping->a_ops = &shmem_aops;
3140                inode->i_op = &shmem_symlink_inode_operations;
3141                memcpy(page_address(page), symname, len);
3142                SetPageUptodate(page);
3143                set_page_dirty(page);
3144                unlock_page(page);
3145                put_page(page);
3146        }
3147        dir->i_size += BOGO_DIRENT_SIZE;
3148        dir->i_ctime = dir->i_mtime = current_time(dir);
3149        d_instantiate(dentry, inode);
3150        dget(dentry);
3151        return 0;
3152}
3153
3154static void shmem_put_link(void *arg)
3155{
3156        mark_page_accessed(arg);
3157        put_page(arg);
3158}
3159
3160static const char *shmem_get_link(struct dentry *dentry,
3161                                  struct inode *inode,
3162                                  struct delayed_call *done)
3163{
3164        struct page *page = NULL;
3165        int error;
3166        if (!dentry) {
3167                page = find_get_page(inode->i_mapping, 0);
3168                if (!page)
3169                        return ERR_PTR(-ECHILD);
3170                if (!PageUptodate(page)) {
3171                        put_page(page);
3172                        return ERR_PTR(-ECHILD);
3173                }
3174        } else {
3175                error = shmem_getpage(inode, 0, &page, SGP_READ);
3176                if (error)
3177                        return ERR_PTR(error);
3178                unlock_page(page);
3179        }
3180        set_delayed_call(done, shmem_put_link, page);
3181        return page_address(page);
3182}
3183
3184#ifdef CONFIG_TMPFS_XATTR
3185/*
3186 * Superblocks without xattr inode operations may get some security.* xattr
3187 * support from the LSM "for free". As soon as we have any other xattrs
3188 * like ACLs, we also need to implement the security.* handlers at
3189 * filesystem level, though.
3190 */
3191
3192/*
3193 * Callback for security_inode_init_security() for acquiring xattrs.
3194 */
3195static int shmem_initxattrs(struct inode *inode,
3196                            const struct xattr *xattr_array,
3197                            void *fs_info)
3198{
3199        struct shmem_inode_info *info = SHMEM_I(inode);
3200        const struct xattr *xattr;
3201        struct simple_xattr *new_xattr;
3202        size_t len;
3203
3204        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3205                new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3206                if (!new_xattr)
3207                        return -ENOMEM;
3208
3209                len = strlen(xattr->name) + 1;
3210                new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3211                                          GFP_KERNEL);
3212                if (!new_xattr->name) {
3213                        kfree(new_xattr);
3214                        return -ENOMEM;
3215                }
3216
3217                memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3218                       XATTR_SECURITY_PREFIX_LEN);
3219                memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3220                       xattr->name, len);
3221
3222                simple_xattr_list_add(&info->xattrs, new_xattr);
3223        }
3224
3225        return 0;
3226}
3227
3228static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3229                                   struct dentry *unused, struct inode *inode,
3230                                   const char *name, void *buffer, size_t size)
3231{
3232        struct shmem_inode_info *info = SHMEM_I(inode);
3233
3234        name = xattr_full_name(handler, name);
3235        return simple_xattr_get(&info->xattrs, name, buffer, size);
3236}
3237
3238static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3239                                   struct dentry *unused, struct inode *inode,
3240                                   const char *name, const void *value,
3241                                   size_t size, int flags)
3242{
3243        struct shmem_inode_info *info = SHMEM_I(inode);
3244
3245        name = xattr_full_name(handler, name);
3246        return simple_xattr_set(&info->xattrs, name, value, size, flags);
3247}
3248
3249static const struct xattr_handler shmem_security_xattr_handler = {
3250        .prefix = XATTR_SECURITY_PREFIX,
3251        .get = shmem_xattr_handler_get,
3252        .set = shmem_xattr_handler_set,
3253};
3254
3255static const struct xattr_handler shmem_trusted_xattr_handler = {
3256        .prefix = XATTR_TRUSTED_PREFIX,
3257        .get = shmem_xattr_handler_get,
3258        .set = shmem_xattr_handler_set,
3259};
3260
3261static const struct xattr_handler *shmem_xattr_handlers[] = {
3262#ifdef CONFIG_TMPFS_POSIX_ACL
3263        &posix_acl_access_xattr_handler,
3264        &posix_acl_default_xattr_handler,
3265#endif
3266        &shmem_security_xattr_handler,
3267        &shmem_trusted_xattr_handler,
3268        NULL
3269};
3270
3271static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3272{
3273        struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3274        return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3275}
3276#endif /* CONFIG_TMPFS_XATTR */
3277
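/*
 * Illustrative userspace sketch (not part of this file's build): setting and
 * reading back a trusted.* attribute stored by the handlers above
 * (CAP_SYS_ADMIN is needed for the trusted namespace).  The helper name
 * tag_file() is made up for this example.
 */
#if 0
#include <stdio.h>
#include <sys/xattr.h>

static int tag_file(const char *path)
{
	char buf[64];
	ssize_t len;

	if (setxattr(path, "trusted.example", "demo", 4, 0) < 0)
		return -1;
	len = getxattr(path, "trusted.example", buf, sizeof(buf));
	if (len < 0)
		return -1;
	printf("trusted.example = %.*s\n", (int)len, buf);
	return 0;
}
#endif
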
3278static const struct inode_operations shmem_short_symlink_operations = {
3279        .get_link       = simple_get_link,
3280#ifdef CONFIG_TMPFS_XATTR
3281        .listxattr      = shmem_listxattr,
3282#endif
3283};
3284
3285static const struct inode_operations shmem_symlink_inode_operations = {
3286        .get_link       = shmem_get_link,
3287#ifdef CONFIG_TMPFS_XATTR
3288        .listxattr      = shmem_listxattr,
3289#endif
3290};
3291
3292static struct dentry *shmem_get_parent(struct dentry *child)
3293{
3294        return ERR_PTR(-ESTALE);
3295}
3296
3297static int shmem_match(struct inode *ino, void *vfh)
3298{
3299        __u32 *fh = vfh;
3300        __u64 inum = fh[2];
3301        inum = (inum << 32) | fh[1];
3302        return ino->i_ino == inum && fh[0] == ino->i_generation;
3303}
3304
3305/* Find any alias of inode, but prefer a hashed alias */
3306static struct dentry *shmem_find_alias(struct inode *inode)
3307{
3308        struct dentry *alias = d_find_alias(inode);
3309
3310        return alias ?: d_find_any_alias(inode);
3311}
3312
3313
3314static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3315                struct fid *fid, int fh_len, int fh_type)
3316{
3317        struct inode *inode;
3318        struct dentry *dentry = NULL;
3319        u64 inum;
3320
3321        if (fh_len < 3)
3322                return NULL;
3323
3324        inum = fid->raw[2];
3325        inum = (inum << 32) | fid->raw[1];
3326
3327        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3328                        shmem_match, fid->raw);
3329        if (inode) {
3330                dentry = shmem_find_alias(inode);
3331                iput(inode);
3332        }
3333
3334        return dentry;
3335}
3336
3337static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3338                                struct inode *parent)
3339{
3340        if (*len < 3) {
3341                *len = 3;
3342                return FILEID_INVALID;
3343        }
3344
3345        if (inode_unhashed(inode)) {
3346                /* Unfortunately, insert_inode_hash() is not idempotent,
3347                 * so as we hash inodes here rather than at creation
3348                 * time, we need a lock to ensure we only try
3349                 * to do it once.
3350                 */
3351                static DEFINE_SPINLOCK(lock);
3352                spin_lock(&lock);
3353                if (inode_unhashed(inode))
3354                        __insert_inode_hash(inode,
3355                                            inode->i_ino + inode->i_generation);
3356                spin_unlock(&lock);
3357        }
3358
3359        fh[0] = inode->i_generation;
3360        fh[1] = inode->i_ino;
3361        fh[2] = ((__u64)inode->i_ino) >> 32;
3362
3363        *len = 3;
3364        return 1;
3365}
3366
3367static const struct export_operations shmem_export_ops = {
3368        .get_parent     = shmem_get_parent,
3369        .encode_fh      = shmem_encode_fh,
3370        .fh_to_dentry   = shmem_fh_to_dentry,
3371};
3372
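/*
 * Minimal sketch (not built here) of how the 3-word handle produced by
 * shmem_encode_fh() is laid out, mirroring the unpacking done in
 * shmem_match() and shmem_fh_to_dentry() above.  The struct and helper are
 * made up for this example.
 */
#if 0
#include <linux/types.h>

struct shmem_fh_layout {
	__u32 generation;	/* fh[0] */
	__u64 inum;		/* fh[2] (high 32 bits) | fh[1] (low 32 bits) */
};

static void shmem_fh_unpack(const __u32 *fh, struct shmem_fh_layout *out)
{
	out->generation = fh[0];
	out->inum = ((__u64)fh[2] << 32) | fh[1];
}
#endif
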
3373enum shmem_param {
3374        Opt_gid,
3375        Opt_huge,
3376        Opt_mode,
3377        Opt_mpol,
3378        Opt_nr_blocks,
3379        Opt_nr_inodes,
3380        Opt_size,
3381        Opt_uid,
3382};
3383
3384static const struct fs_parameter_spec shmem_param_specs[] = {
3385        fsparam_u32   ("gid",           Opt_gid),
3386        fsparam_enum  ("huge",          Opt_huge),
3387        fsparam_u32oct("mode",          Opt_mode),
3388        fsparam_string("mpol",          Opt_mpol),
3389        fsparam_string("nr_blocks",     Opt_nr_blocks),
3390        fsparam_string("nr_inodes",     Opt_nr_inodes),
3391        fsparam_string("size",          Opt_size),
3392        fsparam_u32   ("uid",           Opt_uid),
3393        {}
3394};
3395
3396static const struct fs_parameter_enum shmem_param_enums[] = {
3397        { Opt_huge,     "never",        SHMEM_HUGE_NEVER },
3398        { Opt_huge,     "always",       SHMEM_HUGE_ALWAYS },
3399        { Opt_huge,     "within_size",  SHMEM_HUGE_WITHIN_SIZE },
3400        { Opt_huge,     "advise",       SHMEM_HUGE_ADVISE },
3401        {}
3402};
3403
3404const struct fs_parameter_description shmem_fs_parameters = {
3405        .name           = "tmpfs",
3406        .specs          = shmem_param_specs,
3407        .enums          = shmem_param_enums,
3408};
3409
3410static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3411{
3412        struct shmem_options *ctx = fc->fs_private;
3413        struct fs_parse_result result;
3414        unsigned long long size;
3415        char *rest;
3416        int opt;
3417
3418        opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
3419        if (opt < 0)
3420                return opt;
3421
3422        switch (opt) {
3423        case Opt_size:
3424                size = memparse(param->string, &rest);
3425                if (*rest == '%') {
3426                        size <<= PAGE_SHIFT;
3427                        size *= totalram_pages();
3428                        do_div(size, 100);
3429                        rest++;
3430                }
3431                if (*rest)
3432                        goto bad_value;
3433                ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3434                ctx->seen |= SHMEM_SEEN_BLOCKS;
3435                break;
3436        case Opt_nr_blocks:
3437                ctx->blocks = memparse(param->string, &rest);
3438                if (*rest)
3439                        goto bad_value;
3440                ctx->seen |= SHMEM_SEEN_BLOCKS;
3441                break;
3442        case Opt_nr_inodes:
3443                ctx->inodes = memparse(param->string, &rest);
3444                if (*rest)
3445                        goto bad_value;
3446                ctx->seen |= SHMEM_SEEN_INODES;
3447                break;
3448        case Opt_mode:
3449                ctx->mode = result.uint_32 & 07777;
3450                break;
3451        case Opt_uid:
3452                ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3453                if (!uid_valid(ctx->uid))
3454                        goto bad_value;
3455                break;
3456        case Opt_gid:
3457                ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3458                if (!gid_valid(ctx->gid))
3459                        goto bad_value;
3460                break;
3461        case Opt_huge:
3462                ctx->huge = result.uint_32;
3463                if (ctx->huge != SHMEM_HUGE_NEVER &&
3464                    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
3465                      has_transparent_hugepage()))
3466                        goto unsupported_parameter;
3467                ctx->seen |= SHMEM_SEEN_HUGE;
3468                break;
3469        case Opt_mpol:
3470                if (IS_ENABLED(CONFIG_NUMA)) {
3471                        mpol_put(ctx->mpol);
3472                        ctx->mpol = NULL;
3473                        if (mpol_parse_str(param->string, &ctx->mpol))
3474                                goto bad_value;
3475                        break;
3476                }
3477                goto unsupported_parameter;
3478        }
3479        return 0;
3480
3481unsupported_parameter:
3482        return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
3483bad_value:
3484        return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
3485}
3486
3487static int shmem_parse_options(struct fs_context *fc, void *data)
3488{
3489        char *options = data;
3490
3491        if (options) {
3492                int err = security_sb_eat_lsm_opts(options, &fc->security);
3493                if (err)
3494                        return err;
3495        }
3496
3497        while (options != NULL) {
3498                char *this_char = options;
3499                for (;;) {
3500                        /*
3501                         * NUL-terminate this option: unfortunately,
3502                         * mount options form a comma-separated list,
3503                         * but mpol's nodelist may also contain commas.
3504                         */
3505                        options = strchr(options, ',');
3506                        if (options == NULL)
3507                                break;
3508                        options++;
3509                        if (!isdigit(*options)) {
3510                                options[-1] = '\0';
3511                                break;
3512                        }
3513                }
3514                if (*this_char) {
3515                        char *value = strchr(this_char,'=');
3516                        size_t len = 0;
3517                        int err;
3518
3519                        if (value) {
3520                                *value++ = '\0';
3521                                len = strlen(value);
3522                        }
3523                        err = vfs_parse_fs_string(fc, this_char, value, len);
3524                        if (err < 0)
3525                                return err;
3526                }
3527        }
3528        return 0;
3529}
3530
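/*
 * Illustrative userspace sketch (not part of this file's build): a mount(2)
 * data string like the one shmem_parse_options() splits up above.  The
 * helper name mount_small_tmpfs() and the option values are made up for
 * this example.
 */
#if 0
#include <stdio.h>
#include <sys/mount.h>

static int mount_small_tmpfs(const char *target)
{
	/* "size" accepts k/m/g suffixes or a percentage of RAM, e.g. "50%" */
	if (mount("tmpfs", target, "tmpfs", 0,
		  "size=64m,nr_inodes=16k,mode=1777") < 0) {
		perror("mount");
		return -1;
	}
	return 0;
}
#endif
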
3531/*
3532 * Reconfigure a shmem filesystem.
3533 *
3534 * Note that we disallow change from limited->unlimited blocks/inodes while any
3535 * are in use; but we must separately disallow unlimited->limited, because in
3536 * that case we have no record of how much is already in use.
3537 */
3538static int shmem_reconfigure(struct fs_context *fc)
3539{
3540        struct shmem_options *ctx = fc->fs_private;
3541        struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
3542        unsigned long inodes;
3543        const char *err;
3544
3545        spin_lock(&sbinfo->stat_lock);
3546        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3547        if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3548                if (!sbinfo->max_blocks) {
3549                        err = "Cannot retroactively limit size";
3550                        goto out;
3551                }
3552                if (percpu_counter_compare(&sbinfo->used_blocks,
3553                                           ctx->blocks) > 0) {
3554                        err = "Too small a size for current use";
3555                        goto out;
3556                }
3557        }
3558        if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3559                if (!sbinfo->max_inodes) {
3560                        err = "Cannot retroactively limit inodes";
3561                        goto out;
3562                }
3563                if (ctx->inodes < inodes) {
3564                        err = "Too few inodes for current use";
3565                        goto out;
3566                }
3567        }
3568
3569        if (ctx->seen & SHMEM_SEEN_HUGE)
3570                sbinfo->huge = ctx->huge;
3571        if (ctx->seen & SHMEM_SEEN_BLOCKS)
3572                sbinfo->max_blocks  = ctx->blocks;
3573        if (ctx->seen & SHMEM_SEEN_INODES) {
3574                sbinfo->max_inodes  = ctx->inodes;
3575                sbinfo->free_inodes = ctx->inodes - inodes;
3576        }
3577
3578        /*
3579         * Preserve the previous mempolicy unless the mpol remount option was specified.
3580         */
3581        if (ctx->mpol) {
3582                mpol_put(sbinfo->mpol);
3583                sbinfo->mpol = ctx->mpol;       /* transfers initial ref */
3584                ctx->mpol = NULL;
3585        }
3586        spin_unlock(&sbinfo->stat_lock);
3587        return 0;
3588out:
3589        spin_unlock(&sbinfo->stat_lock);
3590        return invalf(fc, "tmpfs: %s", err);
3591}
3592
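/*
 * Illustrative userspace sketch (not part of this file's build): resizing a
 * mounted instance, which ends up in shmem_reconfigure() above; that fails
 * with "Cannot retroactively limit size" on an unlimited mount and with
 * "Too small a size for current use" if usage already exceeds the new
 * limit.  The helper name grow_tmpfs() is made up for this example.
 */
#if 0
#include <stdio.h>
#include <sys/mount.h>

static int grow_tmpfs(const char *target)
{
	if (mount(NULL, target, NULL, MS_REMOUNT, "size=128m") < 0) {
		perror("remount");
		return -1;
	}
	return 0;
}
#endif
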
3593static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3594{
3595        struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3596
3597        if (sbinfo->max_blocks != shmem_default_max_blocks())
3598                seq_printf(seq, ",size=%luk",
3599                        sbinfo->max_blocks << (PAGE_SHIFT - 10));
3600        if (sbinfo->max_inodes != shmem_default_max_inodes())
3601                seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3602        if (sbinfo->mode != (0777 | S_ISVTX))
3603                seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3604        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3605                seq_printf(seq, ",uid=%u",
3606                                from_kuid_munged(&init_user_ns, sbinfo->uid));
3607        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3608                seq_printf(seq, ",gid=%u",
3609                                from_kgid_munged(&init_user_ns, sbinfo->gid));
3610#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3611        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3612        if (sbinfo->huge)
3613                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3614#endif
3615        shmem_show_mpol(seq, sbinfo->mpol);
3616        return 0;
3617}
3618
3619#endif /* CONFIG_TMPFS */
3620
3621static void shmem_put_super(struct super_block *sb)
3622{
3623        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3624
3625        percpu_counter_destroy(&sbinfo->used_blocks);
3626        mpol_put(sbinfo->mpol);
3627        kfree(sbinfo);
3628        sb->s_fs_info = NULL;
3629}
3630
3631static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
3632{
3633        struct shmem_options *ctx = fc->fs_private;
3634        struct inode *inode;
3635        struct shmem_sb_info *sbinfo;
3636        int err = -ENOMEM;
3637
3638        /* Round up to L1_CACHE_BYTES to resist false sharing */
3639        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3640                                L1_CACHE_BYTES), GFP_KERNEL);
3641        if (!sbinfo)
3642                return -ENOMEM;
3643
3644        sb->s_fs_info = sbinfo;
3645
3646#ifdef CONFIG_TMPFS
3647        /*
3648         * By default we only allow half of the physical RAM per
3649         * tmpfs instance, limiting inodes to one per page of lowmem;
3650         * but the internal instance is left unlimited.
3651         */
3652        if (!(sb->s_flags & SB_KERNMOUNT)) {
3653                if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3654                        ctx->blocks = shmem_default_max_blocks();
3655                if (!(ctx->seen & SHMEM_SEEN_INODES))
3656                        ctx->inodes = shmem_default_max_inodes();
3657        } else {
3658                sb->s_flags |= SB_NOUSER;
3659        }
3660        sb->s_export_op = &shmem_export_ops;
3661        sb->s_flags |= SB_NOSEC;
3662#else
3663        sb->s_flags |= SB_NOUSER;
3664#endif
3665        sbinfo->max_blocks = ctx->blocks;
3666        sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3667        sbinfo->uid = ctx->uid;
3668        sbinfo->gid = ctx->gid;
3669        sbinfo->mode = ctx->mode;
3670        sbinfo->huge = ctx->huge;
3671        sbinfo->mpol = ctx->mpol;
3672        ctx->mpol = NULL;
3673
3674        spin_lock_init(&sbinfo->stat_lock);
3675        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3676                goto failed;
3677        spin_lock_init(&sbinfo->shrinklist_lock);
3678        INIT_LIST_HEAD(&sbinfo->shrinklist);
3679
3680        sb->s_maxbytes = MAX_LFS_FILESIZE;
3681        sb->s_blocksize = PAGE_SIZE;
3682        sb->s_blocksize_bits = PAGE_SHIFT;
3683        sb->s_magic = TMPFS_MAGIC;
3684        sb->s_op = &shmem_ops;
3685        sb->s_time_gran = 1;
3686#ifdef CONFIG_TMPFS_XATTR
3687        sb->s_xattr = shmem_xattr_handlers;
3688#endif
3689#ifdef CONFIG_TMPFS_POSIX_ACL
3690        sb->s_flags |= SB_POSIXACL;
3691#endif
3692        uuid_gen(&sb->s_uuid);
3693
3694        inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
3695        if (!inode)
3696                goto failed;
3697        inode->i_uid = sbinfo->uid;
3698        inode->i_gid = sbinfo->gid;
3699        sb->s_root = d_make_root(inode);
3700        if (!sb->s_root)
3701                goto failed;
3702        return 0;
3703
3704failed:
3705        shmem_put_super(sb);
3706        return err;
3707}
3708
3709static int shmem_get_tree(struct fs_context *fc)
3710{
3711        return get_tree_nodev(fc, shmem_fill_super);
3712}
3713
3714static void shmem_free_fc(struct fs_context *fc)
3715{
3716        struct shmem_options *ctx = fc->fs_private;
3717
3718        if (ctx) {
3719                mpol_put(ctx->mpol);
3720                kfree(ctx);
3721        }
3722}
3723
3724static const struct fs_context_operations shmem_fs_context_ops = {
3725        .free                   = shmem_free_fc,
3726        .get_tree               = shmem_get_tree,
3727#ifdef CONFIG_TMPFS
3728        .parse_monolithic       = shmem_parse_options,
3729        .parse_param            = shmem_parse_one,
3730        .reconfigure            = shmem_reconfigure,
3731#endif
3732};
3733
3734static struct kmem_cache *shmem_inode_cachep;
3735
3736static struct inode *shmem_alloc_inode(struct super_block *sb)
3737{
3738        struct shmem_inode_info *info;
3739        info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3740        if (!info)
3741                return NULL;
3742        return &info->vfs_inode;
3743}
3744
3745static void shmem_free_in_core_inode(struct inode *inode)
3746{
3747        if (S_ISLNK(inode->i_mode))
3748                kfree(inode->i_link);
3749        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3750}
3751
3752static void shmem_destroy_inode(struct inode *inode)
3753{
3754        if (S_ISREG(inode->i_mode))
3755                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3756}
3757
3758static void shmem_init_inode(void *foo)
3759{
3760        struct shmem_inode_info *info = foo;
3761        inode_init_once(&info->vfs_inode);
3762}
3763
3764static void shmem_init_inodecache(void)
3765{
3766        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3767                                sizeof(struct shmem_inode_info),
3768                                0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3769}
3770
3771static void shmem_destroy_inodecache(void)
3772{
3773        kmem_cache_destroy(shmem_inode_cachep);
3774}
3775
3776static const struct address_space_operations shmem_aops = {
3777        .writepage      = shmem_writepage,
3778        .set_page_dirty = __set_page_dirty_no_writeback,
3779#ifdef CONFIG_TMPFS
3780        .write_begin    = shmem_write_begin,
3781        .write_end      = shmem_write_end,
3782#endif
3783#ifdef CONFIG_MIGRATION
3784        .migratepage    = migrate_page,
3785#endif
3786        .error_remove_page = generic_error_remove_page,
3787};
3788
3789static const struct file_operations shmem_file_operations = {
3790        .mmap           = shmem_mmap,
3791        .get_unmapped_area = shmem_get_unmapped_area,
3792#ifdef CONFIG_TMPFS
3793        .llseek         = shmem_file_llseek,
3794        .read_iter      = shmem_file_read_iter,
3795        .write_iter     = generic_file_write_iter,
3796        .fsync          = noop_fsync,
3797        .splice_read    = generic_file_splice_read,
3798        .splice_write   = iter_file_splice_write,
3799        .fallocate      = shmem_fallocate,
3800#endif
3801};
3802
3803static const struct inode_operations shmem_inode_operations = {
3804        .getattr        = shmem_getattr,
3805        .setattr        = shmem_setattr,
3806#ifdef CONFIG_TMPFS_XATTR
3807        .listxattr      = shmem_listxattr,
3808        .set_acl        = simple_set_acl,
3809#endif
3810};
3811
3812static const struct inode_operations shmem_dir_inode_operations = {
3813#ifdef CONFIG_TMPFS
3814        .create         = shmem_create,
3815        .lookup         = simple_lookup,
3816        .link           = shmem_link,
3817        .unlink         = shmem_unlink,
3818        .symlink        = shmem_symlink,
3819        .mkdir          = shmem_mkdir,
3820        .rmdir          = shmem_rmdir,
3821        .mknod          = shmem_mknod,
3822        .rename         = shmem_rename2,
3823        .tmpfile        = shmem_tmpfile,
3824#endif
3825#ifdef CONFIG_TMPFS_XATTR
3826        .listxattr      = shmem_listxattr,
3827#endif
3828#ifdef CONFIG_TMPFS_POSIX_ACL
3829        .setattr        = shmem_setattr,
3830        .set_acl        = simple_set_acl,
3831#endif
3832};
3833
3834static const struct inode_operations shmem_special_inode_operations = {
3835#ifdef CONFIG_TMPFS_XATTR
3836        .listxattr      = shmem_listxattr,
3837#endif
3838#ifdef CONFIG_TMPFS_POSIX_ACL
3839        .setattr        = shmem_setattr,
3840        .set_acl        = simple_set_acl,
3841#endif
3842};
3843
3844static const struct super_operations shmem_ops = {
3845        .alloc_inode    = shmem_alloc_inode,
3846        .free_inode     = shmem_free_in_core_inode,
3847        .destroy_inode  = shmem_destroy_inode,
3848#ifdef CONFIG_TMPFS
3849        .statfs         = shmem_statfs,
3850        .show_options   = shmem_show_options,
3851#endif
3852        .evict_inode    = shmem_evict_inode,
3853        .drop_inode     = generic_delete_inode,
3854        .put_super      = shmem_put_super,
3855#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3856        .nr_cached_objects      = shmem_unused_huge_count,
3857        .free_cached_objects    = shmem_unused_huge_scan,
3858#endif
3859};
3860
3861static const struct vm_operations_struct shmem_vm_ops = {
3862        .fault          = shmem_fault,
3863        .map_pages      = filemap_map_pages,
3864#ifdef CONFIG_NUMA
3865        .set_policy     = shmem_set_policy,
3866        .get_policy     = shmem_get_policy,
3867#endif
3868};
3869
3870int shmem_init_fs_context(struct fs_context *fc)
3871{
3872        struct shmem_options *ctx;
3873
3874        ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3875        if (!ctx)
3876                return -ENOMEM;
3877
3878        ctx->mode = 0777 | S_ISVTX;
3879        ctx->uid = current_fsuid();
3880        ctx->gid = current_fsgid();
3881
3882        fc->fs_private = ctx;
3883        fc->ops = &shmem_fs_context_ops;
3884        return 0;
3885}
3886
3887static struct file_system_type shmem_fs_type = {
3888        .owner          = THIS_MODULE,
3889        .name           = "tmpfs",
3890        .init_fs_context = shmem_init_fs_context,
3891#ifdef CONFIG_TMPFS
3892        .parameters     = &shmem_fs_parameters,
3893#endif
3894        .kill_sb        = kill_litter_super,
3895        .fs_flags       = FS_USERNS_MOUNT,
3896};
3897
3898int __init shmem_init(void)
3899{
3900        int error;
3901
3902        shmem_init_inodecache();
3903
3904        error = register_filesystem(&shmem_fs_type);
3905        if (error) {
3906                pr_err("Could not register tmpfs\n");
3907                goto out2;
3908        }
3909
3910        shm_mnt = kern_mount(&shmem_fs_type);
3911        if (IS_ERR(shm_mnt)) {
3912                error = PTR_ERR(shm_mnt);
3913                pr_err("Could not kern_mount tmpfs\n");
3914                goto out1;
3915        }
3916
3917#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3918        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
3919                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3920        else
3921                shmem_huge = 0; /* just in case it was patched */
3922#endif
3923        return 0;
3924
3925out1:
3926        unregister_filesystem(&shmem_fs_type);
3927out2:
3928        shmem_destroy_inodecache();
3929        shm_mnt = ERR_PTR(error);
3930        return error;
3931}
3932
3933#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
3934static ssize_t shmem_enabled_show(struct kobject *kobj,
3935                struct kobj_attribute *attr, char *buf)
3936{
3937        static const int values[] = {
3938                SHMEM_HUGE_ALWAYS,
3939                SHMEM_HUGE_WITHIN_SIZE,
3940                SHMEM_HUGE_ADVISE,
3941                SHMEM_HUGE_NEVER,
3942                SHMEM_HUGE_DENY,
3943                SHMEM_HUGE_FORCE,
3944        };
3945        int i, count;
3946
3947        for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
3948                const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
3949
3950                count += sprintf(buf + count, fmt,
3951                                shmem_format_huge(values[i]));
3952        }
3953        buf[count - 1] = '\n';
3954        return count;
3955}
3956
3957static ssize_t shmem_enabled_store(struct kobject *kobj,
3958                struct kobj_attribute *attr, const char *buf, size_t count)
3959{
3960        char tmp[16];
3961        int huge;
3962
3963        if (count + 1 > sizeof(tmp))
3964                return -EINVAL;
3965        memcpy(tmp, buf, count);
3966        tmp[count] = '\0';
3967        if (count && tmp[count - 1] == '\n')
3968                tmp[count - 1] = '\0';
3969
3970        huge = shmem_parse_huge(tmp);
3971        if (huge == -EINVAL)
3972                return -EINVAL;
3973        if (!has_transparent_hugepage() &&
3974                        huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
3975                return -EINVAL;
3976
3977        shmem_huge = huge;
3978        if (shmem_huge > SHMEM_HUGE_DENY)
3979                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3980        return count;
3981}
3982
3983struct kobj_attribute shmem_enabled_attr =
3984        __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3985#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
3986
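/*
 * Illustrative userspace sketch (not part of this file's build): writing the
 * sysfs attribute handled by shmem_enabled_store() above.  The helper name
 * set_shmem_huge() is made up for this example; root is required.
 */
#if 0
#include <stdio.h>

static int set_shmem_huge(const char *policy)	/* e.g. "within_size" */
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/shmem_enabled", "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", policy);
	return fclose(f) ? -1 : 0;
}
#endif
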
3987#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3988bool shmem_huge_enabled(struct vm_area_struct *vma)
3989{
3990        struct inode *inode = file_inode(vma->vm_file);
3991        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3992        loff_t i_size;
3993        pgoff_t off;
3994
3995        if ((vma->vm_flags & VM_NOHUGEPAGE) ||
3996            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
3997                return false;
3998        if (shmem_huge == SHMEM_HUGE_FORCE)
3999                return true;
4000        if (shmem_huge == SHMEM_HUGE_DENY)
4001                return false;
4002        switch (sbinfo->huge) {
4003                case SHMEM_HUGE_NEVER:
4004                        return false;
4005                case SHMEM_HUGE_ALWAYS:
4006                        return true;
4007                case SHMEM_HUGE_WITHIN_SIZE:
4008                        off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
4009                        i_size = round_up(i_size_read(inode), PAGE_SIZE);
4010                        if (i_size >= HPAGE_PMD_SIZE &&
4011                                        i_size >> PAGE_SHIFT >= off)
4012                                return true;
4013                        /* fall through */
4014                case SHMEM_HUGE_ADVISE:
4015                        /* TODO: implement fadvise() hints */
4016                        return (vma->vm_flags & VM_HUGEPAGE);
4017                default:
4018                        VM_BUG_ON(1);
4019                        return false;
4020        }
4021}
4022#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
4023
4024#else /* !CONFIG_SHMEM */
4025
4026/*
4027 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4028 *
4029 * This is intended for small systems where the benefits of the full
4030 * shmem code (swap-backed and resource-limited) are outweighed by
4031 * its complexity. On systems without swap this code should be
4032 * effectively equivalent, but much lighter weight.
4033 */
4034
4035static struct file_system_type shmem_fs_type = {
4036        .name           = "tmpfs",
4037        .init_fs_context = ramfs_init_fs_context,
4038        .parameters     = &ramfs_fs_parameters,
4039        .kill_sb        = kill_litter_super,
4040        .fs_flags       = FS_USERNS_MOUNT,
4041};
4042
4043int __init shmem_init(void)
4044{
4045        BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4046
4047        shm_mnt = kern_mount(&shmem_fs_type);
4048        BUG_ON(IS_ERR(shm_mnt));
4049
4050        return 0;
4051}
4052
4053int shmem_unuse(unsigned int type, bool frontswap,
4054                unsigned long *fs_pages_to_unuse)
4055{
4056        return 0;
4057}
4058
4059int shmem_lock(struct file *file, int lock, struct user_struct *user)
4060{
4061        return 0;
4062}
4063
4064void shmem_unlock_mapping(struct address_space *mapping)
4065{
4066}
4067
4068#ifdef CONFIG_MMU
4069unsigned long shmem_get_unmapped_area(struct file *file,
4070                                      unsigned long addr, unsigned long len,
4071                                      unsigned long pgoff, unsigned long flags)
4072{
4073        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4074}
4075#endif
4076
4077void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
4078{
4079        truncate_inode_pages_range(inode->i_mapping, lstart, lend);
4080}
4081EXPORT_SYMBOL_GPL(shmem_truncate_range);
4082
4083#define shmem_vm_ops                            generic_file_vm_ops
4084#define shmem_file_operations                   ramfs_file_operations
4085#define shmem_get_inode(sb, dir, mode, dev, flags)      ramfs_get_inode(sb, dir, mode, dev)
4086#define shmem_acct_size(flags, size)            0
4087#define shmem_unacct_size(flags, size)          do {} while (0)
4088
4089#endif /* CONFIG_SHMEM */
4090
4091/* common code */
4092
4093static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4094                                       unsigned long flags, unsigned int i_flags)
4095{
4096        struct inode *inode;
4097        struct file *res;
4098
4099        if (IS_ERR(mnt))
4100                return ERR_CAST(mnt);
4101
4102        if (size < 0 || size > MAX_LFS_FILESIZE)
4103                return ERR_PTR(-EINVAL);
4104
4105        if (shmem_acct_size(flags, size))
4106                return ERR_PTR(-ENOMEM);
4107
4108        inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
4109                                flags);
4110        if (unlikely(!inode)) {
4111                shmem_unacct_size(flags, size);
4112                return ERR_PTR(-ENOSPC);
4113        }
4114        inode->i_flags |= i_flags;
4115        inode->i_size = size;
4116        clear_nlink(inode);     /* It is unlinked */
4117        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
4118        if (!IS_ERR(res))
4119                res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
4120                                &shmem_file_operations);
4121        if (IS_ERR(res))
4122                iput(inode);
4123        return res;
4124}
4125
4126/**
4127 * shmem_kernel_file_setup - get an unlinked, kernel-internal file living in
4128 *      tmpfs.  There will be NO LSM permission checks against the underlying
4129 *      inode, so users of this interface must do LSM checks at a higher
4130 *      layer.  The current users are the big_key and shm implementations,
4131 *      which provide LSM checks at the key or shm level rather than the inode.
4132 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4133 * @size: size to be set for the file
4134 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4135 */
4136struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4137{
4138        return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4139}
4140
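/*
 * Minimal in-kernel sketch (not built here) of the pattern described above,
 * in the style of the big_key user: back a kernel-private buffer with an
 * unlinked tmpfs file and leave access checks to the caller's level.  The
 * helper name alloc_private_buffer() is made up for this example.
 */
#if 0
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>

static struct file *alloc_private_buffer(loff_t size)
{
	struct file *file = shmem_kernel_file_setup("kernel-buffer", size, 0);

	if (IS_ERR(file))
		return file;
	/* read/write via kernel_read()/kernel_write(); fput() when finished */
	return file;
}
#endif
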
4141/**
4142 * shmem_file_setup - get an unlinked file living in tmpfs
4143 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4144 * @size: size to be set for the file
4145 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4146 */
4147struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4148{
4149        return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4150}
4151EXPORT_SYMBOL_GPL(shmem_file_setup);
4152
4153/**
4154 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4155 * @mnt: the tmpfs mount where the file will be created
4156 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4157 * @size: size to be set for the file
4158 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4159 */
4160struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4161                                       loff_t size, unsigned long flags)
4162{
4163        return __shmem_file_setup(mnt, name, size, flags, 0);
4164}
4165EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4166
4167/**
4168 * shmem_zero_setup - setup a shared anonymous mapping
4169 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
4170 */
4171int shmem_zero_setup(struct vm_area_struct *vma)
4172{
4173        struct file *file;
4174        loff_t size = vma->vm_end - vma->vm_start;
4175
4176        /*
4177         * Cloning a new file under mmap_sem leads to a lock ordering conflict
4178         * between XFS directory reading and selinux: since this file is only
4179         * accessible to the user through its mapping, use S_PRIVATE flag to
4180         * bypass file security, in the same way as shmem_kernel_file_setup().
4181         */
4182        file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4183        if (IS_ERR(file))
4184                return PTR_ERR(file);
4185
4186        if (vma->vm_file)
4187                fput(vma->vm_file);
4188        vma->vm_file = file;
4189        vma->vm_ops = &shmem_vm_ops;
4190
4191        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4192                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
4193                        (vma->vm_end & HPAGE_PMD_MASK)) {
4194                khugepaged_enter(vma, vma->vm_flags);
4195        }
4196
4197        return 0;
4198}
4199
4200/**
4201 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4202 * @mapping:    the page's address_space
4203 * @index:      the page index
4204 * @gfp:        the page allocator flags to use if allocating
4205 *
4206 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4207 * with any new page allocations done using the specified allocation flags.
4208 * But read_cache_page_gfp() uses the ->readpage() method, which does not
4209 * suit tmpfs, since tmpfs may have pages in swapcache and needs to find
4210 * those for itself; drivers/gpu/drm i915 and ttm rely upon this support.
4211 *
4212 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4213 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4214 */
4215struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4216                                         pgoff_t index, gfp_t gfp)
4217{
4218#ifdef CONFIG_SHMEM
4219        struct inode *inode = mapping->host;
4220        struct page *page;
4221        int error;
4222
4223        BUG_ON(mapping->a_ops != &shmem_aops);
4224        error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
4225                                  gfp, NULL, NULL, NULL);
4226        if (error)
4227                page = ERR_PTR(error);
4228        else
4229                unlock_page(page);
4230        return page;
4231#else
4232        /*
4233         * The tiny !SHMEM case uses ramfs without swap
4234         */
4235        return read_cache_page_gfp(mapping, index, gfp);
4236#endif
4237}
4238EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4239
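/*
 * Minimal in-kernel sketch (not built here), roughly following the i915/ttm
 * usage described above: mix __GFP_NORETRY | __GFP_NOWARN into the mapping's
 * gfp mask so a failed allocation is reported rather than OOMing the machine.
 * The helper name get_backing_page() is made up for this example.
 */
#if 0
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *get_backing_page(struct address_space *mapping,
				     pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* result is an unlocked page holding a reference, or an ERR_PTR() */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
#endif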