LXR linux/mm/nommu.c

   1/*
   2 *  linux/mm/nommu.c
   3 *
   4 *  Replacement code for mm functions to support CPU's that don't
   5 *  have any form of memory management unit (thus no virtual memory).
   6 *
   7 *  See Documentation/nommu-mmap.txt
   8 *
   9 *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
  10 *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  11 *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  12 *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
  13 *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
  14 */
  15
  16#include <linux/export.h>
  17#include <linux/mm.h>
  18#include <linux/mman.h>
  19#include <linux/swap.h>
  20#include <linux/file.h>
  21#include <linux/highmem.h>
  22#include <linux/pagemap.h>
  23#include <linux/slab.h>
  24#include <linux/vmalloc.h>
  25#include <linux/blkdev.h>
  26#include <linux/backing-dev.h>
  27#include <linux/mount.h>
  28#include <linux/personality.h>
  29#include <linux/security.h>
  30#include <linux/syscalls.h>
  31#include <linux/audit.h>
  32#include <linux/sched/sysctl.h>
  33
  34#include <asm/uaccess.h>
  35#include <asm/tlb.h>
  36#include <asm/tlbflush.h>
  37#include <asm/mmu_context.h>
  38#include "internal.h"
  39
  40#if 0
  41#define kenter(FMT, ...) \
  42        printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
  43#define kleave(FMT, ...) \
  44        printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
  45#define kdebug(FMT, ...) \
  46        printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
  47#else
  48#define kenter(FMT, ...) \
  49        no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
  50#define kleave(FMT, ...) \
  51        no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
  52#define kdebug(FMT, ...) \
  53        no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
  54#endif
  55
  56void *high_memory;
  57struct page *mem_map;
  58unsigned long max_mapnr;
  59unsigned long num_physpages;
  60unsigned long highest_memmap_pfn;
  61struct percpu_counter vm_committed_as;
  62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
  63int sysctl_overcommit_ratio = 50; /* default is 50% */
  64unsigned long sysctl_overcommit_kbytes __read_mostly;
  65int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
  66int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
  67unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
  68unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
  69int heap_stack_gap = 0;
  70
  71atomic_long_t mmap_pages_allocated;
  72
  73/*
  74 * The global memory commitment made in the system can be a metric
  75 * that can be used to drive ballooning decisions when Linux is hosted
  76 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
  77 * balancing memory across competing virtual machines that are hosted.
  78 * Several metrics drive this policy engine including the guest reported
  79 * memory commitment.
  80 */
  81unsigned long vm_memory_committed(void)
  82{
  83        return percpu_counter_read_positive(&vm_committed_as);
  84}
  85
  86EXPORT_SYMBOL_GPL(vm_memory_committed);
  87
  88EXPORT_SYMBOL(mem_map);
  89EXPORT_SYMBOL(num_physpages);
  90
  91/* list of mapped, potentially shareable regions */
  92static struct kmem_cache *vm_region_jar;
  93struct rb_root nommu_region_tree = RB_ROOT;
  94DECLARE_RWSEM(nommu_region_sem);
  95
  96const struct vm_operations_struct generic_file_vm_ops = {
  97};
  98
  99/*
 100 * Return the total memory allocated for this pointer, not
 101 * just what the caller asked for.
 102 *
 103 * Doesn't have to be accurate, i.e. may have races.
 104 */
 105unsigned int kobjsize(const void *objp)
 106{
 107        struct page *page;
 108
 109        /*
 110         * If the object we have should not have ksize performed on it,
 111         * return size of 0
 112         */
 113        if (!objp || !virt_addr_valid(objp))
 114                return 0;
 115
 116        page = virt_to_head_page(objp);
 117
 118        /*
 119         * If the allocator sets PageSlab, we know the pointer came from
 120         * kmalloc().
 121         */
 122        if (PageSlab(page))
 123                return ksize(objp);
 124
 125        /*
 126         * If it's not a compound page, see if we have a matching VMA
 127         * region. This test is intentionally done in reverse order,
 128         * so if there's no VMA, we still fall through and hand back
 129         * PAGE_SIZE for 0-order pages.
 130         */
 131        if (!PageCompound(page)) {
 132                struct vm_area_struct *vma;
 133
 134                vma = find_vma(current->mm, (unsigned long)objp);
 135                if (vma)
 136                        return vma->vm_end - vma->vm_start;
 137        }
 138
 139        /*
 140         * The ksize() function is only guaranteed to work for pointers
 141         * returned by kmalloc(). So handle arbitrary pointers here.
 142         */
 143        return PAGE_SIZE << compound_order(page);
 144}
 145
 146long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 147                      unsigned long start, unsigned long nr_pages,
 148                      unsigned int foll_flags, struct page **pages,
 149                      struct vm_area_struct **vmas, int *nonblocking)
 150{
 151        struct vm_area_struct *vma;
 152        unsigned long vm_flags;
 153        int i;
 154
 155        /* calculate required read or write permissions.
 156         * If FOLL_FORCE is set, we only require the "MAY" flags.
 157         */
 158        vm_flags  = (foll_flags & FOLL_WRITE) ?
 159                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 160        vm_flags &= (foll_flags & FOLL_FORCE) ?
 161                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 162
 163        for (i = 0; i < nr_pages; i++) {
 164                vma = find_vma(mm, start);
 165                if (!vma)
 166                        goto finish_or_fault;
 167
 168                /* protect what we can, including chardevs */
 169                if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
 170                    !(vm_flags & vma->vm_flags))
 171                        goto finish_or_fault;
 172
 173                if (pages) {
 174                        pages[i] = virt_to_page(start);
 175                        if (pages[i])
 176                                page_cache_get(pages[i]);
 177                }
 178                if (vmas)
 179                        vmas[i] = vma;
 180                start = (start + PAGE_SIZE) & PAGE_MASK;
 181        }
 182
 183        return i;
 184
 185finish_or_fault:
 186        return i ? : -EFAULT;
 187}
 188
 189/*
 190 * get a list of pages in an address range belonging to the specified process
 191 * and indicate the VMA that covers each page
 192 * - this is potentially dodgy as we may end incrementing the page count of a
 193 *   slab page or a secondary page from a compound page
 194 * - don't permit access to VMAs that don't support it, such as I/O mappings
 195 */
 196long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 197                    unsigned long start, unsigned long nr_pages,
 198                    int write, int force, struct page **pages,
 199                    struct vm_area_struct **vmas)
 200{
 201        int flags = 0;
 202
 203        if (write)
 204                flags |= FOLL_WRITE;
 205        if (force)
 206                flags |= FOLL_FORCE;
 207
 208        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
 209                                NULL);
 210}
 211EXPORT_SYMBOL(get_user_pages);
 212
 213long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
 214                           unsigned long start, unsigned long nr_pages,
 215                           int write, int force, struct page **pages,
 216                           int *locked)
 217{
 218        return get_user_pages(tsk, mm, start, nr_pages, write, force,
 219                              pages, NULL);
 220}
 221EXPORT_SYMBOL(get_user_pages_locked);
 222
 223long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 224                               unsigned long start, unsigned long nr_pages,
 225                               int write, int force, struct page **pages,
 226                               unsigned int gup_flags)
 227{
 228        long ret;
 229        down_read(&mm->mmap_sem);
 230        ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
 231                             pages, NULL);
 232        up_read(&mm->mmap_sem);
 233        return ret;
 234}
 235EXPORT_SYMBOL(__get_user_pages_unlocked);
 236
 237long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 238                             unsigned long start, unsigned long nr_pages,
 239                             int write, int force, struct page **pages)
 240{
 241        return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
 242                                         force, pages, 0);
 243}
 244EXPORT_SYMBOL(get_user_pages_unlocked);
 245
 246/**
 247 * follow_pfn - look up PFN at a user virtual address
 248 * @vma: memory mapping
 249 * @address: user virtual address
 250 * @pfn: location to store found PFN
 251 *
 252 * Only IO mappings and raw PFN mappings are allowed.
 253 *
 254 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 255 */
 256int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 257        unsigned long *pfn)
 258{
 259        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 260                return -EINVAL;
 261
 262        *pfn = address >> PAGE_SHIFT;
 263        return 0;
 264}
 265EXPORT_SYMBOL(follow_pfn);
 266
 267LIST_HEAD(vmap_area_list);
 268
 269void vfree(const void *addr)
 270{
 271        kfree(addr);
 272}
 273EXPORT_SYMBOL(vfree);
 274
 275void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 276{
 277        /*
 278         *  You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
 279         * returns only a logical address.
 280         */
 281        return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 282}
 283EXPORT_SYMBOL(__vmalloc);
 284
 285void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
 286{
 287        return __vmalloc(size, flags, PAGE_KERNEL);
 288}
 289
 290void *vmalloc_user(unsigned long size)
 291{
 292        void *ret;
 293
 294        ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
 295                        PAGE_KERNEL);
 296        if (ret) {
 297                struct vm_area_struct *vma;
 298
 299                down_write(&current->mm->mmap_sem);
 300                vma = find_vma(current->mm, (unsigned long)ret);
 301                if (vma)
 302                        vma->vm_flags |= VM_USERMAP;
 303                up_write(&current->mm->mmap_sem);
 304        }
 305
 306        return ret;
 307}
 308EXPORT_SYMBOL(vmalloc_user);
 309
 310struct page *vmalloc_to_page(const void *addr)
 311{
 312        return virt_to_page(addr);
 313}
 314EXPORT_SYMBOL(vmalloc_to_page);
 315
 316unsigned long vmalloc_to_pfn(const void *addr)
 317{
 318        return page_to_pfn(virt_to_page(addr));
 319}
 320EXPORT_SYMBOL(vmalloc_to_pfn);
 321
 322long vread(char *buf, char *addr, unsigned long count)
 323{
 324        memcpy(buf, addr, count);
 325        return count;
 326}
 327
 328long vwrite(char *buf, char *addr, unsigned long count)
 329{
 330        /* Don't allow overflow */
 331        if ((unsigned long) addr + count < count)
 332                count = -(unsigned long) addr;
 333
 334        memcpy(addr, buf, count);
 335        return(count);
 336}
 337
 338/*
 339 *      vmalloc  -  allocate virtually continguos memory
 340 *
 341 *      @size:          allocation size
 342 *
 343 *      Allocate enough pages to cover @size from the page level
 344 *      allocator and map them into continguos kernel virtual space.
 345 *
 346 *      For tight control over page level allocator and protection flags
 347 *      use __vmalloc() instead.
 348 */
 349void *vmalloc(unsigned long size)
 350{
 351       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 352}
 353EXPORT_SYMBOL(vmalloc);
 354
 355/*
 356 *      vzalloc - allocate virtually continguos memory with zero fill
 357 *
 358 *      @size:          allocation size
 359 *
 360 *      Allocate enough pages to cover @size from the page level
 361 *      allocator and map them into continguos kernel virtual space.
 362 *      The memory allocated is set to zero.
 363 *
 364 *      For tight control over page level allocator and protection flags
 365 *      use __vmalloc() instead.
 366 */
 367void *vzalloc(unsigned long size)
 368{
 369        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
 370                        PAGE_KERNEL);
 371}
 372EXPORT_SYMBOL(vzalloc);
 373
 374/**
 375 * vmalloc_node - allocate memory on a specific node
 376 * @size:       allocation size
 377 * @node:       numa node
 378 *
 379 * Allocate enough pages to cover @size from the page level
 380 * allocator and map them into contiguous kernel virtual space.
 381 *
 382 * For tight control over page level allocator and protection flags
 383 * use __vmalloc() instead.
 384 */
 385void *vmalloc_node(unsigned long size, int node)
 386{
 387        return vmalloc(size);
 388}
 389EXPORT_SYMBOL(vmalloc_node);
 390
 391/**
 392 * vzalloc_node - allocate memory on a specific node with zero fill
 393 * @size:       allocation size
 394 * @node:       numa node
 395 *
 396 * Allocate enough pages to cover @size from the page level
 397 * allocator and map them into contiguous kernel virtual space.
 398 * The memory allocated is set to zero.
 399 *
 400 * For tight control over page level allocator and protection flags
 401 * use __vmalloc() instead.
 402 */
 403void *vzalloc_node(unsigned long size, int node)
 404{
 405        return vzalloc(size);
 406}
 407EXPORT_SYMBOL(vzalloc_node);
 408
 409#ifndef PAGE_KERNEL_EXEC
 410# define PAGE_KERNEL_EXEC PAGE_KERNEL
 411#endif
 412
 413/**
 414 *      vmalloc_exec  -  allocate virtually contiguous, executable memory
 415 *      @size:          allocation size
 416 *
 417 *      Kernel-internal function to allocate enough pages to cover @size
 418 *      the page level allocator and map them into contiguous and
 419 *      executable kernel virtual space.
 420 *
 421 *      For tight control over page level allocator and protection flags
 422 *      use __vmalloc() instead.
 423 */
 424
 425void *vmalloc_exec(unsigned long size)
 426{
 427        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
 428}
 429
 430/**
 431 * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 432 *      @size:          allocation size
 433 *
 434 *      Allocate enough 32bit PA addressable pages to cover @size from the
 435 *      page level allocator and map them into continguos kernel virtual space.
 436 */
 437void *vmalloc_32(unsigned long size)
 438{
 439        return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
 440}
 441EXPORT_SYMBOL(vmalloc_32);
 442
 443/**
 444 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 445 *      @size:          allocation size
 446 *
 447 * The resulting memory area is 32bit addressable and zeroed so it can be
 448 * mapped to userspace without leaking data.
 449 *
 450 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
 451 * remap_vmalloc_range() are permissible.
 452 */
 453void *vmalloc_32_user(unsigned long size)
 454{
 455        /*
 456         * We'll have to sort out the ZONE_DMA bits for 64-bit,
 457         * but for now this can simply use vmalloc_user() directly.
 458         */
 459        return vmalloc_user(size);
 460}
 461EXPORT_SYMBOL(vmalloc_32_user);
 462
 463void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
 464{
 465        BUG();
 466        return NULL;
 467}
 468EXPORT_SYMBOL(vmap);
 469
 470void vunmap(const void *addr)
 471{
 472        BUG();
 473}
 474EXPORT_SYMBOL(vunmap);
 475
 476void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 477{
 478        BUG();
 479        return NULL;
 480}
 481EXPORT_SYMBOL(vm_map_ram);
 482
 483void vm_unmap_ram(const void *mem, unsigned int count)
 484{
 485        BUG();
 486}
 487EXPORT_SYMBOL(vm_unmap_ram);
 488
 489void vm_unmap_aliases(void)
 490{
 491}
 492EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 493
 494/*
 495 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 496 * have one.
 497 */
 498void  __attribute__((weak)) vmalloc_sync_all(void)
 499{
 500}
 501
 502/**
 503 *      alloc_vm_area - allocate a range of kernel address space
 504 *      @size:          size of the area
 505 *
 506 *      Returns:        NULL on failure, vm_struct on success
 507 *
 508 *      This function reserves a range of kernel address space, and
 509 *      allocates pagetables to map that range.  No actual mappings
 510 *      are created.  If the kernel address space is not shared
 511 *      between processes, it syncs the pagetable across all
 512 *      processes.
 513 */
 514struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 515{
 516        BUG();
 517        return NULL;
 518}
 519EXPORT_SYMBOL_GPL(alloc_vm_area);
 520
 521void free_vm_area(struct vm_struct *area)
 522{
 523        BUG();
 524}
 525EXPORT_SYMBOL_GPL(free_vm_area);
 526
 527int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 528                   struct page *page)
 529{
 530        return -EINVAL;
 531}
 532EXPORT_SYMBOL(vm_insert_page);
 533
 534/*
 535 *  sys_brk() for the most part doesn't need the global kernel
 536 *  lock, except when an application is doing something nasty
 537 *  like trying to un-brk an area that has already been mapped
 538 *  to a regular file.  in this case, the unmapping will need
 539 *  to invoke file system routines that need the global lock.
 540 */
 541SYSCALL_DEFINE1(brk, unsigned long, brk)
 542{
 543        struct mm_struct *mm = current->mm;
 544
 545        if (brk < mm->start_brk || brk > mm->context.end_brk)
 546                return mm->brk;
 547
 548        if (mm->brk == brk)
 549                return mm->brk;
 550
 551        /*
 552         * Always allow shrinking brk
 553         */
 554        if (brk <= mm->brk) {
 555                mm->brk = brk;
 556                return brk;
 557        }
 558
 559        /*
 560         * Ok, looks good - let it rip.
 561         */
 562        flush_icache_range(mm->brk, brk);
 563        return mm->brk = brk;
 564}
 565
 566/*
 567 * initialise the VMA and region record slabs
 568 */
 569void __init mmap_init(void)
 570{
 571        int ret;
 572
 573        ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
 574        VM_BUG_ON(ret);
 575        vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 576}
 577
 578/*
 579 * validate the region tree
 580 * - the caller must hold the region lock
 581 */
 582#ifdef CONFIG_DEBUG_NOMMU_REGIONS
 583static noinline void validate_nommu_regions(void)
 584{
 585        struct vm_region *region, *last;
 586        struct rb_node *p, *lastp;
 587
 588        lastp = rb_first(&nommu_region_tree);
 589        if (!lastp)
 590                return;
 591
 592        last = rb_entry(lastp, struct vm_region, vm_rb);
 593        BUG_ON(unlikely(last->vm_end <= last->vm_start));
 594        BUG_ON(unlikely(last->vm_top < last->vm_end));
 595
 596        while ((p = rb_next(lastp))) {
 597                region = rb_entry(p, struct vm_region, vm_rb);
 598                last = rb_entry(lastp, struct vm_region, vm_rb);
 599
 600                BUG_ON(unlikely(region->vm_end <= region->vm_start));
 601                BUG_ON(unlikely(region->vm_top < region->vm_end));
 602                BUG_ON(unlikely(region->vm_start < last->vm_top));
 603
 604                lastp = p;
 605        }
 606}
 607#else
 608static void validate_nommu_regions(void)
 609{
 610}
 611#endif
 612
 613/*
 614 * add a region into the global tree
 615 */
 616static void add_nommu_region(struct vm_region *region)
 617{
 618        struct vm_region *pregion;
 619        struct rb_node **p, *parent;
 620
 621        validate_nommu_regions();
 622
 623        parent = NULL;
 624        p = &nommu_region_tree.rb_node;
 625        while (*p) {
 626                parent = *p;
 627                pregion = rb_entry(parent, struct vm_region, vm_rb);
 628                if (region->vm_start < pregion->vm_start)
 629                        p = &(*p)->rb_left;
 630                else if (region->vm_start > pregion->vm_start)
 631                        p = &(*p)->rb_right;
 632                else if (pregion == region)
 633                        return;
 634                else
 635                        BUG();
 636        }
 637
 638        rb_link_node(&region->vm_rb, parent, p);
 639        rb_insert_color(&region->vm_rb, &nommu_region_tree);
 640
 641        validate_nommu_regions();
 642}
 643
 644/*
 645 * delete a region from the global tree
 646 */
 647static void delete_nommu_region(struct vm_region *region)
 648{
 649        BUG_ON(!nommu_region_tree.rb_node);
 650
 651        validate_nommu_regions();
 652        rb_erase(&region->vm_rb, &nommu_region_tree);
 653        validate_nommu_regions();
 654}
 655
 656/*
 657 * free a contiguous series of pages
 658 */
 659static void free_page_series(unsigned long from, unsigned long to)
 660{
 661        for (; from < to; from += PAGE_SIZE) {
 662                struct page *page = virt_to_page(from);
 663
 664                kdebug("- free %lx", from);
 665                atomic_long_dec(&mmap_pages_allocated);
 666                if (page_count(page) != 1)
 667                        kdebug("free page %p: refcount not one: %d",
 668                               page, page_count(page));
 669                put_page(page);
 670        }
 671}
 672
 673/*
 674 * release a reference to a region
 675 * - the caller must hold the region semaphore for writing, which this releases
 676 * - the region may not have been added to the tree yet, in which case vm_top
 677 *   will equal vm_start
 678 */
 679static void __put_nommu_region(struct vm_region *region)
 680        __releases(nommu_region_sem)
 681{
 682        kenter("%p{%d}", region, region->vm_usage);
 683
 684        BUG_ON(!nommu_region_tree.rb_node);
 685
 686        if (--region->vm_usage == 0) {
 687                if (region->vm_top > region->vm_start)
 688                        delete_nommu_region(region);
 689                up_write(&nommu_region_sem);
 690
 691                if (region->vm_file)
 692                        fput(region->vm_file);
 693
 694                /* IO memory and memory shared directly out of the pagecache
 695                 * from ramfs/tmpfs mustn't be released here */
 696                if (region->vm_flags & VM_MAPPED_COPY) {
 697                        kdebug("free series");
 698                        free_page_series(region->vm_start, region->vm_top);
 699                }
 700                kmem_cache_free(vm_region_jar, region);
 701        } else {
 702                up_write(&nommu_region_sem);
 703        }
 704}
 705
 706/*
 707 * release a reference to a region
 708 */
 709static void put_nommu_region(struct vm_region *region)
 710{
 711        down_write(&nommu_region_sem);
 712        __put_nommu_region(region);
 713}
 714
 715/*
 716 * update protection on a vma
 717 */
 718static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
 719{
 720#ifdef CONFIG_MPU
 721        struct mm_struct *mm = vma->vm_mm;
 722        long start = vma->vm_start & PAGE_MASK;
 723        while (start < vma->vm_end) {
 724                protect_page(mm, start, flags);
 725                start += PAGE_SIZE;
 726        }
 727        update_protections(mm);
 728#endif
 729}
 730
 731/*
 732 * add a VMA into a process's mm_struct in the appropriate place in the list
 733 * and tree and add to the address space's page tree also if not an anonymous
 734 * page
 735 * - should be called with mm->mmap_sem held writelocked
 736 */
 737static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 738{
 739        struct vm_area_struct *pvma, *prev;
 740        struct address_space *mapping;
 741        struct rb_node **p, *parent, *rb_prev;
 742
 743        kenter(",%p", vma);
 744
 745        BUG_ON(!vma->vm_region);
 746
 747        mm->map_count++;
 748        vma->vm_mm = mm;
 749
 750        protect_vma(vma, vma->vm_flags);
 751
 752        /* add the VMA to the mapping */
 753        if (vma->vm_file) {
 754                mapping = vma->vm_file->f_mapping;
 755
 756                mutex_lock(&mapping->i_mmap_mutex);
 757                flush_dcache_mmap_lock(mapping);
 758                vma_interval_tree_insert(vma, &mapping->i_mmap);
 759                flush_dcache_mmap_unlock(mapping);
 760                mutex_unlock(&mapping->i_mmap_mutex);
 761        }
 762
 763        /* add the VMA to the tree */
 764        parent = rb_prev = NULL;
 765        p = &mm->mm_rb.rb_node;
 766        while (*p) {
 767                parent = *p;
 768                pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
 769
 770                /* sort by: start addr, end addr, VMA struct addr in that order
 771                 * (the latter is necessary as we may get identical VMAs) */
 772                if (vma->vm_start < pvma->vm_start)
 773                        p = &(*p)->rb_left;
 774                else if (vma->vm_start > pvma->vm_start) {
 775                        rb_prev = parent;
 776                        p = &(*p)->rb_right;
 777                } else if (vma->vm_end < pvma->vm_end)
 778                        p = &(*p)->rb_left;
 779                else if (vma->vm_end > pvma->vm_end) {
 780                        rb_prev = parent;
 781                        p = &(*p)->rb_right;
 782                } else if (vma < pvma)
 783                        p = &(*p)->rb_left;
 784                else if (vma > pvma) {
 785                        rb_prev = parent;
 786                        p = &(*p)->rb_right;
 787                } else
 788                        BUG();
 789        }
 790
 791        rb_link_node(&vma->vm_rb, parent, p);
 792        rb_insert_color(&vma->vm_rb, &mm->mm_rb);
 793
 794        /* add VMA to the VMA list also */
 795        prev = NULL;
 796        if (rb_prev)
 797                prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
 798
 799        __vma_link_list(mm, vma, prev, parent);
 800}
 801
 802/*
 803 * delete a VMA from its owning mm_struct and address space
 804 */
 805static void delete_vma_from_mm(struct vm_area_struct *vma)
 806{
 807        struct address_space *mapping;
 808        struct mm_struct *mm = vma->vm_mm;
 809
 810        kenter("%p", vma);
 811
 812        protect_vma(vma, 0);
 813
 814        mm->map_count--;
 815        if (mm->mmap_cache == vma)
 816                mm->mmap_cache = NULL;
 817
 818        /* remove the VMA from the mapping */
 819        if (vma->vm_file) {
 820                mapping = vma->vm_file->f_mapping;
 821
 822                mutex_lock(&mapping->i_mmap_mutex);
 823                flush_dcache_mmap_lock(mapping);
 824                vma_interval_tree_remove(vma, &mapping->i_mmap);
 825                flush_dcache_mmap_unlock(mapping);
 826                mutex_unlock(&mapping->i_mmap_mutex);
 827        }
 828
 829        /* remove from the MM's tree and list */
 830        rb_erase(&vma->vm_rb, &mm->mm_rb);
 831
 832        if (vma->vm_prev)
 833                vma->vm_prev->vm_next = vma->vm_next;
 834        else
 835                mm->mmap = vma->vm_next;
 836
 837        if (vma->vm_next)
 838                vma->vm_next->vm_prev = vma->vm_prev;
 839}
 840
 841/*
 842 * destroy a VMA record
 843 */
 844static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 845{
 846        kenter("%p", vma);
 847        if (vma->vm_ops && vma->vm_ops->close)
 848                vma->vm_ops->close(vma);
 849        if (vma->vm_file)
 850                fput(vma->vm_file);
 851        put_nommu_region(vma->vm_region);
 852        kmem_cache_free(vm_area_cachep, vma);
 853}
 854
 855/*
 856 * look up the first VMA in which addr resides, NULL if none
 857 * - should be called with mm->mmap_sem at least held readlocked
 858 */
 859struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 860{
 861        struct vm_area_struct *vma;
 862
 863        /* check the cache first */
 864        vma = ACCESS_ONCE(mm->mmap_cache);
 865        if (vma && vma->vm_start <= addr && vma->vm_end > addr)
 866                return vma;
 867
 868        /* trawl the list (there may be multiple mappings in which addr
 869         * resides) */
 870        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 871                if (vma->vm_start > addr)
 872                        return NULL;
 873                if (vma->vm_end > addr) {
 874                        mm->mmap_cache = vma;
 875                        return vma;
 876                }
 877        }
 878
 879        return NULL;
 880}
 881EXPORT_SYMBOL(find_vma);
 882
 883/*
 884 * find a VMA
 885 * - we don't extend stack VMAs under NOMMU conditions
 886 */
 887struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 888{
 889        return find_vma(mm, addr);
 890}
 891
 892/*
 893 * expand a stack to a given address
 894 * - not supported under NOMMU conditions
 895 */
 896int expand_stack(struct vm_area_struct *vma, unsigned long address)
 897{
 898        return -ENOMEM;
 899}
 900
 901/*
 902 * look up the first VMA exactly that exactly matches addr
 903 * - should be called with mm->mmap_sem at least held readlocked
 904 */
 905static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
 906                                             unsigned long addr,
 907                                             unsigned long len)
 908{
 909        struct vm_area_struct *vma;
 910        unsigned long end = addr + len;
 911
 912        /* check the cache first */
 913        vma = mm->mmap_cache;
 914        if (vma && vma->vm_start == addr && vma->vm_end == end)
 915                return vma;
 916
 917        /* trawl the list (there may be multiple mappings in which addr
 918         * resides) */
 919        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 920                if (vma->vm_start < addr)
 921                        continue;
 922                if (vma->vm_start > addr)
 923                        return NULL;
 924                if (vma->vm_end == end) {
 925                        mm->mmap_cache = vma;
 926                        return vma;
 927                }
 928        }
 929
 930        return NULL;
 931}
 932
 933/*
 934 * determine whether a mapping should be permitted and, if so, what sort of
 935 * mapping we're capable of supporting
 936 */
 937static int validate_mmap_request(struct file *file,
 938                                 unsigned long addr,
 939                                 unsigned long len,
 940                                 unsigned long prot,
 941                                 unsigned long flags,
 942                                 unsigned long pgoff,
 943                                 unsigned long *_capabilities)
 944{
 945        unsigned long capabilities, rlen;
 946        int ret;
 947
 948        /* do the simple checks first */
 949        if (flags & MAP_FIXED) {
 950                printk(KERN_DEBUG
 951                       "%d: Can't do fixed-address/overlay mmap of RAM\n",
 952                       current->pid);
 953                return -EINVAL;
 954        }
 955
 956        if ((flags & MAP_TYPE) != MAP_PRIVATE &&
 957            (flags & MAP_TYPE) != MAP_SHARED)
 958                return -EINVAL;
 959
 960        if (!len)
 961                return -EINVAL;
 962
 963        /* Careful about overflows.. */
 964        rlen = PAGE_ALIGN(len);
 965        if (!rlen || rlen > TASK_SIZE)
 966                return -ENOMEM;
 967
 968        /* offset overflow? */
 969        if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
 970                return -EOVERFLOW;
 971
 972        if (file) {
 973                /* validate file mapping requests */
 974                struct address_space *mapping;
 975
 976                /* files must support mmap */
 977                if (!file->f_op || !file->f_op->mmap)
 978                        return -ENODEV;
 979
 980                /* work out if what we've got could possibly be shared
 981                 * - we support chardevs that provide their own "memory"
 982                 * - we support files/blockdevs that are memory backed
 983                 */
 984                mapping = file->f_mapping;
 985                if (!mapping)
 986                        mapping = file_inode(file)->i_mapping;
 987
 988                capabilities = 0;
 989                if (mapping && mapping->backing_dev_info)
 990                        capabilities = mapping->backing_dev_info->capabilities;
 991
 992                if (!capabilities) {
 993                        /* no explicit capabilities set, so assume some
 994                         * defaults */
 995                        switch (file_inode(file)->i_mode & S_IFMT) {
 996                        case S_IFREG:
 997                        case S_IFBLK:
 998                                capabilities = BDI_CAP_MAP_COPY;
 999                                break;
1000

1001                        case S_IFCHR:
1002                                capabilities =
1003                                        BDI_CAP_MAP_DIRECT |
1004                                        BDI_CAP_READ_MAP |
1005                                        BDI_CAP_WRITE_MAP;
1006                                break;
1007
1008                        default:
1009                                return -EINVAL;
1010                        }
1011                }
1012
1013                /* eliminate any capabilities that we can't support on this
1014                 * device */
1015                if (!file->f_op->get_unmapped_area)
1016                        capabilities &= ~BDI_CAP_MAP_DIRECT;
1017                if (!file->f_op->read)
1018                        capabilities &= ~BDI_CAP_MAP_COPY;
1019
1020                /* The file shall have been opened with read permission. */
1021                if (!(file->f_mode & FMODE_READ))
1022                        return -EACCES;
1023
1024                if (flags & MAP_SHARED) {
1025                        /* do checks for writing, appending and locking */
1026                        if ((prot & PROT_WRITE) &&
1027                            !(file->f_mode & FMODE_WRITE))
1028                                return -EACCES;
1029
1030                        if (IS_APPEND(file_inode(file)) &&
1031                            (file->f_mode & FMODE_WRITE))
1032                                return -EACCES;
1033
1034                        if (locks_verify_locked(file))
1035                                return -EAGAIN;
1036
1037                        if (!(capabilities & BDI_CAP_MAP_DIRECT))
1038                                return -ENODEV;
1039
1040                        /* we mustn't privatise shared mappings */
1041                        capabilities &= ~BDI_CAP_MAP_COPY;
1042                }
1043                else {
1044                        /* we're going to read the file into private memory we
1045                         * allocate */
1046                        if (!(capabilities & BDI_CAP_MAP_COPY))
1047                                return -ENODEV;
1048
1049                        /* we don't permit a private writable mapping to be
1050                         * shared with the backing device */
1051                        if (prot & PROT_WRITE)
1052                                capabilities &= ~BDI_CAP_MAP_DIRECT;
1053                }
1054
1055                if (capabilities & BDI_CAP_MAP_DIRECT) {
1056                        if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
1057                            ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
1058                            ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
1059                            ) {
1060                                capabilities &= ~BDI_CAP_MAP_DIRECT;
1061                                if (flags & MAP_SHARED) {
1062                                        printk(KERN_WARNING
1063                                               "MAP_SHARED not completely supported on !MMU\n");
1064                                        return -EINVAL;
1065                                }
1066                        }
1067                }
1068
1069                /* handle executable mappings and implied executable
1070                 * mappings */
1071                if (path_noexec(&file->f_path)) {
1072                        if (prot & PROT_EXEC)
1073                                return -EPERM;
1074                }
1075                else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1076                        /* handle implication of PROT_EXEC by PROT_READ */
1077                        if (current->personality & READ_IMPLIES_EXEC) {
1078                                if (capabilities & BDI_CAP_EXEC_MAP)
1079                                        prot |= PROT_EXEC;
1080                        }
1081                }
1082                else if ((prot & PROT_READ) &&
1083                         (prot & PROT_EXEC) &&
1084                         !(capabilities & BDI_CAP_EXEC_MAP)
1085                         ) {
1086                        /* backing file is not executable, try to copy */
1087                        capabilities &= ~BDI_CAP_MAP_DIRECT;
1088                }
1089        }
1090        else {
1091                /* anonymous mappings are always memory backed and can be
1092                 * privately mapped
1093                 */
1094                capabilities = BDI_CAP_MAP_COPY;
1095
1096                /* handle PROT_EXEC implication by PROT_READ */
1097                if ((prot & PROT_READ) &&
1098                    (current->personality & READ_IMPLIES_EXEC))
1099                        prot |= PROT_EXEC;
1100        }
1101
1102        /* allow the security API to have its say */
1103        ret = security_mmap_addr(addr);
1104        if (ret < 0)
1105                return ret;
1106
1107        /* looks okay */
1108        *_capabilities = capabilities;
1109        return 0;
1110}
1111
1112/*
1113 * we've determined that we can make the mapping, now translate what we
1114 * now know into VMA flags
1115 */
1116static unsigned long determine_vm_flags(struct file *file,
1117                                        unsigned long prot,
1118                                        unsigned long flags,
1119                                        unsigned long capabilities)
1120{
1121        unsigned long vm_flags;
1122
1123        vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
1124        /* vm_flags |= mm->def_flags; */
1125
1126        if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
1127                /* attempt to share read-only copies of mapped file chunks */
1128                vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1129                if (file && !(prot & PROT_WRITE))
1130                        vm_flags |= VM_MAYSHARE;
1131        } else {
1132                /* overlay a shareable mapping on the backing device or inode
1133                 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
1134                 * romfs/cramfs */
1135                vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
1136                if (flags & MAP_SHARED)
1137                        vm_flags |= VM_SHARED;
1138        }
1139
1140        /* refuse to let anyone share private mappings with this process if
1141         * it's being traced - otherwise breakpoints set in it may interfere
1142         * with another untraced process
1143         */
1144        if ((flags & MAP_PRIVATE) && current->ptrace)
1145                vm_flags &= ~VM_MAYSHARE;
1146
1147        return vm_flags;
1148}
1149
1150/*
1151 * set up a shared mapping on a file (the driver or filesystem provides and
1152 * pins the storage)
1153 */
1154static int do_mmap_shared_file(struct vm_area_struct *vma)
1155{
1156        int ret;
1157
1158        ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1159        if (ret == 0) {
1160                vma->vm_region->vm_top = vma->vm_region->vm_end;
1161                return 0;
1162        }
1163        if (ret != -ENOSYS)
1164                return ret;
1165
1166        /* getting -ENOSYS indicates that direct mmap isn't possible (as
1167         * opposed to tried but failed) so we can only give a suitable error as
1168         * it's not possible to make a private copy if MAP_SHARED was given */
1169        return -ENODEV;
1170}
1171
1172/*
1173 * set up a private mapping or an anonymous shared mapping
1174 */
1175static int do_mmap_private(struct vm_area_struct *vma,
1176                           struct vm_region *region,
1177                           unsigned long len,
1178                           unsigned long capabilities)
1179{
1180        struct page *pages;
1181        unsigned long total, point, n;
1182        void *base;
1183        int ret, order;
1184
1185        /* invoke the file's mapping function so that it can keep track of
1186         * shared mappings on devices or memory
1187         * - VM_MAYSHARE will be set if it may attempt to share
1188         */
1189        if (capabilities & BDI_CAP_MAP_DIRECT) {
1190                ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1191                if (ret == 0) {
1192                        /* shouldn't return success if we're not sharing */
1193                        BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1194                        vma->vm_region->vm_top = vma->vm_region->vm_end;
1195                        return 0;
1196                }
1197                if (ret != -ENOSYS)
1198                        return ret;
1199
1200                /* getting an ENOSYS error indicates that direct mmap isn't
1201                 * possible (as opposed to tried but failed) so we'll try to
1202                 * make a private copy of the data and map that instead */
1203        }
1204
1205
1206        /* allocate some memory to hold the mapping
1207         * - note that this may not return a page-aligned address if the object
1208         *   we're allocating is smaller than a page
1209         */
1210        order = get_order(len);
1211        kdebug("alloc order %d for %lx", order, len);
1212
1213        pages = alloc_pages(GFP_KERNEL, order);
1214        if (!pages)
1215                goto enomem;
1216
1217        total = 1 << order;
1218        atomic_long_add(total, &mmap_pages_allocated);
1219
1220        point = len >> PAGE_SHIFT;
1221
1222        /* we allocated a power-of-2 sized page set, so we may want to trim off
1223         * the excess */
1224        if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1225                while (total > point) {
1226                        order = ilog2(total - point);
1227                        n = 1 << order;
1228                        kdebug("shave %lu/%lu @%lu", n, total - point, total);
1229                        atomic_long_sub(n, &mmap_pages_allocated);
1230                        total -= n;
1231                        set_page_refcounted(pages + total);
1232                        __free_pages(pages + total, order);
1233                }
1234        }
1235
1236        for (point = 1; point < total; point++)
1237                set_page_refcounted(&pages[point]);
1238
1239        base = page_address(pages);
1240        region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1241        region->vm_start = (unsigned long) base;
1242        region->vm_end   = region->vm_start + len;
1243        region->vm_top   = region->vm_start + (total << PAGE_SHIFT);
1244
1245        vma->vm_start = region->vm_start;
1246        vma->vm_end   = region->vm_start + len;
1247
1248        if (vma->vm_file) {
1249                /* read the contents of a file into the copy */
1250                mm_segment_t old_fs;
1251                loff_t fpos;
1252
1253                fpos = vma->vm_pgoff;
1254                fpos <<= PAGE_SHIFT;
1255
1256                old_fs = get_fs();
1257                set_fs(KERNEL_DS);
1258                ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1259                set_fs(old_fs);
1260
1261                if (ret < 0)
1262                        goto error_free;
1263
1264                /* clear the last little bit */
1265                if (ret < len)
1266                        memset(base + ret, 0, len - ret);
1267
1268        }
1269
1270        return 0;
1271
1272error_free:
1273        free_page_series(region->vm_start, region->vm_top);
1274        region->vm_start = vma->vm_start = 0;
1275        region->vm_end   = vma->vm_end = 0;
1276        region->vm_top   = 0;
1277        return ret;
1278
1279enomem:
1280        printk("Allocation of length %lu from process %d (%s) failed\n",
1281               len, current->pid, current->comm);
1282        show_free_areas(0);
1283        return -ENOMEM;
1284}
1285
1286/*
1287 * handle mapping creation for uClinux
1288 */
1289unsigned long do_mmap(struct file *file,
1290                        unsigned long addr,
1291                        unsigned long len,
1292                        unsigned long prot,
1293                        unsigned long flags,
1294                        vm_flags_t vm_flags,
1295                        unsigned long pgoff,
1296                        unsigned long *populate,
1297                        struct list_head *uf_unused)
1298{
1299        struct vm_area_struct *vma;
1300        struct vm_region *region;
1301        struct rb_node *rb;
1302        unsigned long capabilities, result;
1303        int ret;
1304
1305        kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1306
1307        *populate = 0;
1308
1309        /* decide whether we should attempt the mapping, and if so what sort of
1310         * mapping */
1311        ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1312                                    &capabilities);
1313        if (ret < 0) {
1314                kleave(" = %d [val]", ret);
1315                return ret;
1316        }
1317
1318        /* we ignore the address hint */
1319        addr = 0;
1320        len = PAGE_ALIGN(len);
1321
1322        /* we've determined that we can make the mapping, now translate what we
1323         * now know into VMA flags */
1324        vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1325
1326        /* we're going to need to record the mapping */
1327        region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1328        if (!region)
1329                goto error_getting_region;
1330
1331        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1332        if (!vma)
1333                goto error_getting_vma;
1334
1335        region->vm_usage = 1;
1336        region->vm_flags = vm_flags;
1337        region->vm_pgoff = pgoff;
1338
1339        INIT_LIST_HEAD(&vma->anon_vma_chain);
1340        vma->vm_flags = vm_flags;
1341        vma->vm_pgoff = pgoff;
1342
1343        if (file) {
1344                region->vm_file = get_file(file);
1345                vma->vm_file = get_file(file);
1346        }
1347
1348        down_write(&nommu_region_sem);
1349
1350        /* if we want to share, we need to check for regions created by other
1351         * mmap() calls that overlap with our proposed mapping
1352         * - we can only share with a superset match on most regular files
1353         * - shared mappings on character devices and memory backed files are
1354         *   permitted to overlap inexactly as far as we are concerned for in
1355         *   these cases, sharing is handled in the driver or filesystem rather
1356         *   than here
1357         */
1358        if (vm_flags & VM_MAYSHARE) {
1359                struct vm_region *pregion;
1360                unsigned long pglen, rpglen, pgend, rpgend, start;
1361
1362                pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1363                pgend = pgoff + pglen;
1364
1365                for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1366                        pregion = rb_entry(rb, struct vm_region, vm_rb);
1367
1368                        if (!(pregion->vm_flags & VM_MAYSHARE))
1369                                continue;
1370
1371                        /* search for overlapping mappings on the same file */
1372                        if (file_inode(pregion->vm_file) !=
1373                            file_inode(file))
1374                                continue;
1375
1376                        if (pregion->vm_pgoff >= pgend)
1377                                continue;
1378
1379                        rpglen = pregion->vm_end - pregion->vm_start;
1380                        rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1381                        rpgend = pregion->vm_pgoff + rpglen;
1382                        if (pgoff >= rpgend)
1383                                continue;
1384
1385                        /* handle inexactly overlapping matches between
1386                         * mappings */
1387                        if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1388                            !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1389                                /* new mapping is not a subset of the region */
1390                                if (!(capabilities & BDI_CAP_MAP_DIRECT))
1391                                        goto sharing_violation;
1392                                continue;
1393                        }
1394
1395                        /* we've found a region we can share */
1396                        pregion->vm_usage++;
1397                        vma->vm_region = pregion;
1398                        start = pregion->vm_start;
1399                        start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1400                        vma->vm_start = start;
1401                        vma->vm_end = start + len;
1402
1403                        if (pregion->vm_flags & VM_MAPPED_COPY) {
1404                                kdebug("share copy");
1405                                vma->vm_flags |= VM_MAPPED_COPY;
1406                        } else {
1407                                kdebug("share mmap");
1408                                ret = do_mmap_shared_file(vma);
1409                                if (ret < 0) {
1410                                        vma->vm_region = NULL;
1411                                        vma->vm_start = 0;
1412                                        vma->vm_end = 0;
1413                                        pregion->vm_usage--;
1414                                        pregion = NULL;
1415                                        goto error_just_free;
1416                                }
1417                        }
1418                        fput(region->vm_file);
1419                        kmem_cache_free(vm_region_jar, region);
1420                        region = pregion;
1421                        result = start;
1422                        goto share;
1423                }
1424
1425                /* obtain the address at which to make a shared mapping
1426                 * - this is the hook for quasi-memory character devices to
1427                 *   tell us the location of a shared mapping
1428                 */
1429                if (capabilities & BDI_CAP_MAP_DIRECT) {
1430                        addr = file->f_op->get_unmapped_area(file, addr, len,
1431                                                             pgoff, flags);
1432                        if (IS_ERR_VALUE(addr)) {
1433                                ret = addr;
1434                                if (ret != -ENOSYS)
1435                                        goto error_just_free;
1436
1437                                /* the driver refused to tell us where to site
1438                                 * the mapping so we'll have to attempt to copy
1439                                 * it */
1440                                ret = -ENODEV;
1441                                if (!(capabilities & BDI_CAP_MAP_COPY))
1442                                        goto error_just_free;
1443
1444                                capabilities &= ~BDI_CAP_MAP_DIRECT;
1445                        } else {
1446                                vma->vm_start = region->vm_start = addr;
1447                                vma->vm_end = region->vm_end = addr + len;
1448                        }
1449                }
1450        }
1451
1452        vma->vm_region = region;
1453
1454        /* set up the mapping
1455         * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1456         */
1457        if (file && vma->vm_flags & VM_SHARED)
1458                ret = do_mmap_shared_file(vma);
1459        else
1460                ret = do_mmap_private(vma, region, len, capabilities);
1461        if (ret < 0)
1462                goto error_just_free;
1463        add_nommu_region(region);
1464
1465        /* clear anonymous mappings that don't ask for uninitialized data */
1466        if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1467                memset((void *)region->vm_start, 0,
1468                       region->vm_end - region->vm_start);
1469
1470        /* okay... we have a mapping; now we have to register it */
1471        result = vma->vm_start;
1472
1473        current->mm->total_vm += len >> PAGE_SHIFT;
1474
1475share:
1476        add_vma_to_mm(current->mm, vma);
1477
1478        /* we flush the region from the icache only when the first executable
1479         * mapping of it is made  */
1480        if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1481                flush_icache_range(region->vm_start, region->vm_end);
1482                region->vm_icache_flushed = true;
1483        }
1484
1485        up_write(&nommu_region_sem);
1486
1487        kleave(" = %lx", result);
1488        return result;
1489
1490error_just_free:
1491        up_write(&nommu_region_sem);
1492error:
1493        if (region->vm_file)
1494                fput(region->vm_file);
1495        kmem_cache_free(vm_region_jar, region);
1496        if (vma->vm_file)
1497                fput(vma->vm_file);
1498        kmem_cache_free(vm_area_cachep, vma);
1499        kleave(" = %d", ret);
1500        return ret;
1501
1502sharing_violation:
1503        up_write(&nommu_region_sem);
1504        printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1505        ret = -EINVAL;
1506        goto error;
1507
1508error_getting_vma:
1509        kmem_cache_free(vm_region_jar, region);
1510        printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1511               " from process %d failed\n",
1512               len, current->pid);
1513        show_free_areas(0);
1514        return -ENOMEM;
1515
1516error_getting_region:
1517        printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1518               " from process %d failed\n",
1519               len, current->pid);
1520        show_free_areas(0);
1521        return -ENOMEM;
1522}
1523
1524SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1525                unsigned long, prot, unsigned long, flags,
1526                unsigned long, fd, unsigned long, pgoff)
1527{
1528        struct file *file = NULL;
1529        unsigned long retval = -EBADF;
1530
1531        audit_mmap_fd(fd, flags);
1532        if (!(flags & MAP_ANONYMOUS)) {
1533                file = fget(fd);
1534                if (!file)
1535                        goto out;
1536        }
1537
1538        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1539
1540        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1541
1542        if (file)
1543                fput(file);
1544out:
1545        return retval;
1546}
1547
1548#ifdef __ARCH_WANT_SYS_OLD_MMAP
1549struct mmap_arg_struct {
1550        unsigned long addr;
1551        unsigned long len;
1552        unsigned long prot;
1553        unsigned long flags;
1554        unsigned long fd;
1555        unsigned long offset;
1556};
1557
1558SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1559{
1560        struct mmap_arg_struct a;
1561
1562        if (copy_from_user(&a, arg, sizeof(a)))
1563                return -EFAULT;
1564        if (a.offset & ~PAGE_MASK)
1565                return -EINVAL;
1566
1567        return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1568                              a.offset >> PAGE_SHIFT);
1569}
1570#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1571
1572/*
1573 * split a vma into two pieces at address 'addr', a new vma is allocated either
1574 * for the first part or the tail.
1575 */
1576int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1577              unsigned long addr, int new_below)
1578{
1579        struct vm_area_struct *new;
1580        struct vm_region *region;
1581        unsigned long npages;
1582
1583        kenter("");
1584
1585        /* we're only permitted to split anonymous regions (these should have
1586         * only a single usage on the region) */
1587        if (vma->vm_file)
1588                return -ENOMEM;
1589
1590        if (mm->map_count >= sysctl_max_map_count)
1591                return -ENOMEM;
1592
1593        region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1594        if (!region)
1595                return -ENOMEM;
1596
1597        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1598        if (!new) {
1599                kmem_cache_free(vm_region_jar, region);
1600                return -ENOMEM;
1601        }
1602
1603        /* most fields are the same, copy all, and then fixup */
1604        *new = *vma;
1605        *region = *vma->vm_region;
1606        new->vm_region = region;
1607
1608        npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1609
1610        if (new_below) {
1611                region->vm_top = region->vm_end = new->vm_end = addr;
1612        } else {
1613                region->vm_start = new->vm_start = addr;
1614                region->vm_pgoff = new->vm_pgoff += npages;
1615        }
1616
1617        if (new->vm_ops && new->vm_ops->open)
1618                new->vm_ops->open(new);
1619
1620        delete_vma_from_mm(vma);
1621        down_write(&nommu_region_sem);
1622        delete_nommu_region(vma->vm_region);
1623        if (new_below) {
1624                vma->vm_region->vm_start = vma->vm_start = addr;
1625                vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1626        } else {
1627                vma->vm_region->vm_end = vma->vm_end = addr;
1628                vma->vm_region->vm_top = addr;
1629        }
1630        add_nommu_region(vma->vm_region);
1631        add_nommu_region(new->vm_region);
1632        up_write(&nommu_region_sem);
1633        add_vma_to_mm(mm, vma);
1634        add_vma_to_mm(mm, new);
1635        return 0;
1636}
1637
1638/*
1639 * shrink a VMA by removing the specified chunk from either the beginning or
1640 * the end
1641 */
1642static int shrink_vma(struct mm_struct *mm,
1643                      struct vm_area_struct *vma,
1644                      unsigned long from, unsigned long to)
1645{
1646        struct vm_region *region;
1647
1648        kenter("");
1649
1650        /* adjust the VMA's pointers, which may reposition it in the MM's tree
1651         * and list */
1652        delete_vma_from_mm(vma);
1653        if (from > vma->vm_start)
1654                vma->vm_end = from;
1655        else
1656                vma->vm_start = to;
1657        add_vma_to_mm(mm, vma);
1658
1659        /* cut the backing region down to size */
1660        region = vma->vm_region;
1661        BUG_ON(region->vm_usage != 1);
1662
1663        down_write(&nommu_region_sem);
1664        delete_nommu_region(region);
1665        if (from > region->vm_start) {
1666                to = region->vm_top;
1667                region->vm_top = region->vm_end = from;
1668        } else {
1669                region->vm_start = to;
1670        }
1671        add_nommu_region(region);
1672        up_write(&nommu_region_sem);
1673
1674        free_page_series(from, to);
1675        return 0;
1676}
1677
1678/*
1679 * release a mapping
1680 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1681 *   VMA, though it need not cover the whole VMA
1682 */
1683int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
1684{
1685        struct vm_area_struct *vma;
1686        unsigned long end;
1687        int ret;
1688
1689        kenter(",%lx,%zx", start, len);
1690
1691        len = PAGE_ALIGN(len);
1692        if (len == 0)
1693                return -EINVAL;
1694
1695        end = start + len;
1696
1697        /* find the first potentially overlapping VMA */
1698        vma = find_vma(mm, start);
1699        if (!vma) {
1700                static int limit = 0;
1701                if (limit < 5) {
1702                        printk(KERN_WARNING
1703                               "munmap of memory not mmapped by process %d"
1704                               " (%s): 0x%lx-0x%lx\n",
1705                               current->pid, current->comm,
1706                               start, start + len - 1);
1707                        limit++;
1708                }
1709                return -EINVAL;
1710        }
1711
1712        /* we're allowed to split an anonymous VMA but not a file-backed one */
1713        if (vma->vm_file) {
1714                do {
1715                        if (start > vma->vm_start) {
1716                                kleave(" = -EINVAL [miss]");
1717                                return -EINVAL;
1718                        }
1719                        if (end == vma->vm_end)
1720                                goto erase_whole_vma;
1721                        vma = vma->vm_next;
1722                } while (vma);
1723                kleave(" = -EINVAL [split file]");
1724                return -EINVAL;
1725        } else {
1726                /* the chunk must be a subset of the VMA found */
1727                if (start == vma->vm_start && end == vma->vm_end)
1728                        goto erase_whole_vma;
1729                if (start < vma->vm_start || end > vma->vm_end) {
1730                        kleave(" = -EINVAL [superset]");
1731                        return -EINVAL;
1732                }
1733                if (start & ~PAGE_MASK) {
1734                        kleave(" = -EINVAL [unaligned start]");
1735                        return -EINVAL;
1736                }
1737                if (end != vma->vm_end && end & ~PAGE_MASK) {
1738                        kleave(" = -EINVAL [unaligned split]");
1739                        return -EINVAL;
1740                }
1741                if (start != vma->vm_start && end != vma->vm_end) {
1742                        ret = split_vma(mm, vma, start, 1);
1743                        if (ret < 0) {
1744                                kleave(" = %d [split]", ret);
1745                                return ret;
1746                        }
1747                }
1748                return shrink_vma(mm, vma, start, end);
1749        }
1750
1751erase_whole_vma:
1752        delete_vma_from_mm(vma);
1753        delete_vma(mm, vma);
1754        kleave(" = 0");
1755        return 0;
1756}
1757EXPORT_SYMBOL(do_munmap);
1758
1759int vm_munmap(unsigned long addr, size_t len)
1760{
1761        struct mm_struct *mm = current->mm;
1762        int ret;
1763
1764        down_write(&mm->mmap_sem);
1765        ret = do_munmap(mm, addr, len, NULL);
1766        up_write(&mm->mmap_sem);
1767        return ret;
1768}
1769EXPORT_SYMBOL(vm_munmap);
1770
1771SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1772{
1773        return vm_munmap(addr, len);
1774}
1775
1776/*
1777 * release all the mappings made in a process's VM space
1778 */
1779void exit_mmap(struct mm_struct *mm)
1780{
1781        struct vm_area_struct *vma;
1782
1783        if (!mm)
1784                return;
1785
1786        kenter("");
1787
1788        mm->total_vm = 0;
1789
1790        while ((vma = mm->mmap)) {
1791                mm->mmap = vma->vm_next;
1792                delete_vma_from_mm(vma);
1793                delete_vma(mm, vma);
1794                cond_resched();
1795        }
1796
1797        kleave("");
1798}
1799
1800unsigned long vm_brk(unsigned long addr, unsigned long len)
1801{
1802        return -ENOMEM;
1803}
1804
1805/*
1806 * expand (or shrink) an existing mapping, potentially moving it at the same
1807 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1808 *
1809 * under NOMMU conditions, we only permit changing a mapping's size, and only
1810 * as long as it stays within the region allocated by do_mmap_private() and the
1811 * block is not shareable
1812 *
1813 * MREMAP_FIXED is not supported under NOMMU conditions
1814 */
1815static unsigned long do_mremap(unsigned long addr,
1816                        unsigned long old_len, unsigned long new_len,
1817                        unsigned long flags, unsigned long new_addr)
1818{
1819        struct vm_area_struct *vma;
1820
1821        /* insanity checks first */
1822        old_len = PAGE_ALIGN(old_len);
1823        new_len = PAGE_ALIGN(new_len);
1824        if (old_len == 0 || new_len == 0)
1825                return (unsigned long) -EINVAL;
1826
1827        if (addr & ~PAGE_MASK)
1828                return -EINVAL;
1829
1830        if (flags & MREMAP_FIXED && new_addr != addr)
1831                return (unsigned long) -EINVAL;
1832
1833        vma = find_vma_exact(current->mm, addr, old_len);
1834        if (!vma)
1835                return (unsigned long) -EINVAL;
1836
1837        if (vma->vm_end != vma->vm_start + old_len)
1838                return (unsigned long) -EFAULT;
1839
1840        if (vma->vm_flags & VM_MAYSHARE)
1841                return (unsigned long) -EPERM;
1842
1843        if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1844                return (unsigned long) -ENOMEM;
1845
1846        /* all checks complete - do it */
1847        vma->vm_end = vma->vm_start + new_len;
1848        return vma->vm_start;
1849}
1850
1851SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1852                unsigned long, new_len, unsigned long, flags,
1853                unsigned long, new_addr)
1854{
1855        unsigned long ret;
1856
1857        down_write(&current->mm->mmap_sem);
1858        ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1859        up_write(&current->mm->mmap_sem);
1860        return ret;
1861}
1862
1863struct page *follow_page_mask(struct vm_area_struct *vma,
1864                              unsigned long address, unsigned int flags,
1865                              unsigned int *page_mask)
1866{
1867        *page_mask = 0;
1868        return NULL;
1869}
1870
1871int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1872                unsigned long pfn, unsigned long size, pgprot_t prot)
1873{
1874        if (addr != (pfn << PAGE_SHIFT))
1875                return -EINVAL;
1876
1877        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1878        return 0;
1879}
1880EXPORT_SYMBOL(remap_pfn_range);
1881
1882int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1883{
1884        unsigned long pfn = start >> PAGE_SHIFT;
1885        unsigned long vm_len = vma->vm_end - vma->vm_start;
1886
1887        pfn += vma->vm_pgoff;
1888        return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1889}
1890EXPORT_SYMBOL(vm_iomap_memory);
1891
1892int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1893                        unsigned long pgoff)
1894{
1895        unsigned int size = vma->vm_end - vma->vm_start;
1896
1897        if (!(vma->vm_flags & VM_USERMAP))
1898                return -EINVAL;
1899
1900        vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1901        vma->vm_end = vma->vm_start + size;
1902
1903        return 0;
1904}
1905EXPORT_SYMBOL(remap_vmalloc_range);
1906
1907unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1908        unsigned long len, unsigned long pgoff, unsigned long flags)
1909{
1910        return -ENOMEM;
1911}
1912
1913void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1914{
1915}
1916
1917void unmap_mapping_range(struct address_space *mapping,
1918                         loff_t const holebegin, loff_t const holelen,
1919                         int even_cows)
1920{
1921}
1922EXPORT_SYMBOL(unmap_mapping_range);
1923
1924/*
1925 * Check that a process has enough memory to allocate a new virtual
1926 * mapping. 0 means there is enough memory for the allocation to
1927 * succeed and -ENOMEM implies there is not.
1928 *
1929 * We currently support three overcommit policies, which are set via the
1930 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
1931 *
1932 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
1933 * Additional code 2002 Jul 20 by Robert Love.
1934 *
1935 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
1936 *
1937 * Note this is a helper function intended to be used by LSMs which
1938 * wish to use this logic.
1939 */
1940int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1941{
1942        unsigned long free, allowed, reserve;
1943
1944        vm_acct_memory(pages);
1945
1946        /*
1947         * Sometimes we want to use more memory than we have
1948         */
1949        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
1950                return 0;
1951
1952        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1953                free = global_page_state(NR_FREE_PAGES);
1954                free += global_page_state(NR_FILE_PAGES);
1955
1956                /*
1957                 * shmem pages shouldn't be counted as free in this
1958                 * case, they can't be purged, only swapped out, and
1959                 * that won't affect the overall amount of available
1960                 * memory in the system.
1961                 */
1962                free -= global_page_state(NR_SHMEM);
1963
1964                free += get_nr_swap_pages();
1965
1966                /*
1967                 * Any slabs which are created with the
1968                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
1969                 * which are reclaimable, under pressure.  The dentry
1970                 * cache and most inode caches should fall into this
1971                 */
1972                free += global_page_state(NR_SLAB_RECLAIMABLE);
1973
1974                /*
1975                 * Leave reserved pages. The pages are not for anonymous pages.
1976                 */
1977                if (free <= totalreserve_pages)
1978                        goto error;
1979                else
1980                        free -= totalreserve_pages;
1981
1982                /*
1983                 * Reserve some for root
1984                 */
1985                if (!cap_sys_admin)
1986                        free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1987
1988                if (free > pages)
1989                        return 0;
1990
1991                goto error;
1992        }
1993
1994        allowed = vm_commit_limit();
1995        /*
1996         * Reserve some 3% for root
1997         */
1998        if (!cap_sys_admin)
1999                allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
2000

2001        /*
2002         * Don't let a single process grow so big a user can't recover
2003         */
2004        if (mm) {
2005                reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
2006                allowed -= min(mm->total_vm / 32, reserve);
2007        }
2008
2009        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
2010                return 0;
2011
2012error:
2013        vm_unacct_memory(pages);
2014
2015        return -ENOMEM;
2016}
2017
2018int in_gate_area_no_mm(unsigned long addr)
2019{
2020        return 0;
2021}
2022
2023int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2024{
2025        BUG();
2026        return 0;
2027}
2028EXPORT_SYMBOL(filemap_fault);
2029
2030int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
2031                             unsigned long size, pgoff_t pgoff)
2032{
2033        BUG();
2034        return 0;
2035}
2036EXPORT_SYMBOL(generic_file_remap_pages);
2037
2038static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
2039                unsigned long addr, void *buf, int len, int write)
2040{
2041        struct vm_area_struct *vma;
2042
2043        down_read(&mm->mmap_sem);
2044
2045        /* the access must start within one of the target process's mappings */
2046        vma = find_vma(mm, addr);
2047        if (vma) {
2048                /* don't overrun this mapping */
2049                if (addr + len >= vma->vm_end)
2050                        len = vma->vm_end - addr;
2051
2052                /* only read or write mappings where it is permitted */
2053                if (write && vma->vm_flags & VM_MAYWRITE)
2054                        copy_to_user_page(vma, NULL, addr,
2055                                         (void *) addr, buf, len);
2056                else if (!write && vma->vm_flags & VM_MAYREAD)
2057                        copy_from_user_page(vma, NULL, addr,
2058                                            buf, (void *) addr, len);
2059                else
2060                        len = 0;
2061        } else {
2062                len = 0;
2063        }
2064
2065        up_read(&mm->mmap_sem);
2066
2067        return len;
2068}
2069
2070/**
2071 * @access_remote_vm - access another process' address space
2072 * @mm:         the mm_struct of the target address space
2073 * @addr:       start address to access
2074 * @buf:        source or destination buffer
2075 * @len:        number of bytes to transfer
2076 * @write:      whether the access is a write
2077 *
2078 * The caller must hold a reference on @mm.
2079 */
2080int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2081                void *buf, int len, int write)
2082{
2083        return __access_remote_vm(NULL, mm, addr, buf, len, write);
2084}
2085
2086/*
2087 * Access another process' address space.
2088 * - source/target buffer must be kernel space
2089 */
2090int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2091{
2092        struct mm_struct *mm;
2093
2094        if (addr + len < addr)
2095                return 0;
2096
2097        mm = get_task_mm(tsk);
2098        if (!mm)
2099                return 0;
2100
2101        len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2102
2103        mmput(mm);
2104        return len;
2105}
2106
2107/**
2108 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
2109 * @inode: The inode to check
2110 * @size: The current filesize of the inode
2111 * @newsize: The proposed filesize of the inode
2112 *
2113 * Check the shared mappings on an inode on behalf of a shrinking truncate to
2114 * make sure that that any outstanding VMAs aren't broken and then shrink the
2115 * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
2116 * automatically grant mappings that are too large.
2117 */
2118int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2119                                size_t newsize)
2120{
2121        struct vm_area_struct *vma;
2122        struct vm_region *region;
2123        pgoff_t low, high;
2124        size_t r_size, r_top;
2125
2126        low = newsize >> PAGE_SHIFT;
2127        high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2128
2129        down_write(&nommu_region_sem);
2130        mutex_lock(&inode->i_mapping->i_mmap_mutex);
2131
2132        /* search for VMAs that fall within the dead zone */
2133        vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2134                /* found one - only interested if it's shared out of the page
2135                 * cache */
2136                if (vma->vm_flags & VM_SHARED) {
2137                        mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2138                        up_write(&nommu_region_sem);
2139                        return -ETXTBSY; /* not quite true, but near enough */
2140                }
2141        }
2142
2143        /* reduce any regions that overlap the dead zone - if in existence,
2144         * these will be pointed to by VMAs that don't overlap the dead zone
2145         *
2146         * we don't check for any regions that start beyond the EOF as there
2147         * shouldn't be any
2148         */
2149        vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2150                                  0, ULONG_MAX) {
2151                if (!(vma->vm_flags & VM_SHARED))
2152                        continue;
2153
2154                region = vma->vm_region;
2155                r_size = region->vm_top - region->vm_start;
2156                r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
2157
2158                if (r_top > newsize) {
2159                        region->vm_top -= r_top - newsize;
2160                        if (region->vm_end > region->vm_top)
2161                                region->vm_end = region->vm_top;
2162                }
2163        }
2164
2165        mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2166        up_write(&nommu_region_sem);
2167        return 0;
2168}
2169
2170/*
2171 * Initialise sysctl_user_reserve_kbytes.
2172 *
2173 * This is intended to prevent a user from starting a single memory hogging
2174 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
2175 * mode.
2176 *
2177 * The default value is min(3% of free memory, 128MB)
2178 * 128MB is enough to recover with sshd/login, bash, and top/kill.
2179 */
2180static int __meminit init_user_reserve(void)
2181{
2182        unsigned long free_kbytes;
2183
2184        free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2185
2186        sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
2187        return 0;
2188}
2189module_init(init_user_reserve)
2190
2191/*
2192 * Initialise sysctl_admin_reserve_kbytes.
2193 *
2194 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
2195 * to log in and kill a memory hogging process.
2196 *
2197 * Systems with more than 256MB will reserve 8MB, enough to recover
2198 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
2199 * only reserve 3% of free pages by default.
2200 */
2201static int __meminit init_admin_reserve(void)
2202{
2203        unsigned long free_kbytes;
2204
2205        free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2206
2207        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
2208        return 0;
2209}
2210module_init(init_admin_reserve)
2211