linux/mm/mmap.c
   1/*
   2 * mm/mmap.c
   3 *
   4 * Written by obz.
   5 *
   6 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
   7 */
   8
   9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  10
  11#include <linux/kernel.h>
  12#include <linux/slab.h>
  13#include <linux/backing-dev.h>
  14#include <linux/mm.h>
  15#include <linux/vmacache.h>
  16#include <linux/shm.h>
  17#include <linux/mman.h>
  18#include <linux/pagemap.h>
  19#include <linux/swap.h>
  20#include <linux/syscalls.h>
  21#include <linux/capability.h>
  22#include <linux/init.h>
  23#include <linux/file.h>
  24#include <linux/fs.h>
  25#include <linux/personality.h>
  26#include <linux/security.h>
  27#include <linux/hugetlb.h>
  28#include <linux/shmem_fs.h>
  29#include <linux/profile.h>
  30#include <linux/export.h>
  31#include <linux/mount.h>
  32#include <linux/mempolicy.h>
  33#include <linux/rmap.h>
  34#include <linux/mmu_notifier.h>
  35#include <linux/mmdebug.h>
  36#include <linux/perf_event.h>
  37#include <linux/audit.h>
  38#include <linux/khugepaged.h>
  39#include <linux/uprobes.h>
  40#include <linux/rbtree_augmented.h>
  41#include <linux/notifier.h>
  42#include <linux/memory.h>
  43#include <linux/printk.h>
  44#include <linux/userfaultfd_k.h>
  45#include <linux/moduleparam.h>
  46#include <linux/pkeys.h>
  47
  48#include <linux/uaccess.h>
  49#include <asm/cacheflush.h>
  50#include <asm/tlb.h>
  51#include <asm/mmu_context.h>
  52
  53#include "internal.h"
  54
  55#ifndef arch_mmap_check
  56#define arch_mmap_check(addr, len, flags)       (0)
  57#endif
  58
  59#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
  60const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
  61const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
  62int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
  63#endif
  64#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
  65const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
  66const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
  67int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
  68#endif
  69
  70static bool ignore_rlimit_data;
  71core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
  72
  73static void unmap_region(struct mm_struct *mm,
  74                struct vm_area_struct *vma, struct vm_area_struct *prev,
  75                unsigned long start, unsigned long end);
  76
  77/* description of effects of mapping type and prot in current implementation.
  78 * this is due to the limited x86 page protection hardware.  The expected
  79 * behavior is in parens:
  80 *
  81 * map_type     prot
  82 *              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
  83 * MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  84 *              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
  85 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  86 *
  87 * MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  88 *              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
  89 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  90 *
  91 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
  92 * MAP_PRIVATE:
  93 *                                                              r: (no) no
  94 *                                                              w: (no) no
  95 *                                                              x: (yes) yes
  96 */
  97pgprot_t protection_map[16] __ro_after_init = {
  98        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
  99        __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
 100};
 101
 102pgprot_t vm_get_page_prot(unsigned long vm_flags)
 103{
 104        return __pgprot(pgprot_val(protection_map[vm_flags &
 105                                (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
 106                        pgprot_val(arch_vm_get_page_prot(vm_flags)));
 107}
 108EXPORT_SYMBOL(vm_get_page_prot);
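
/*
 * Illustrative sketch (not part of the original source): a private,
 * readable and writable mapping has vm_flags & (VM_READ|VM_WRITE|
 * VM_EXEC|VM_SHARED) == VM_READ|VM_WRITE, so the lookup above indexes
 * protection_map[3], i.e. __P011, which per the table above is not
 * hardware-writable -- the first write faults and is handled as COW:
 *
 *	pgprot_t prot = vm_get_page_prot(VM_READ | VM_WRITE);
 */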
 109
 110static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 111{
 112        return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
 113}
 114
 115/* Update vma->vm_page_prot to reflect vma->vm_flags. */
 116void vma_set_page_prot(struct vm_area_struct *vma)
 117{
 118        unsigned long vm_flags = vma->vm_flags;
 119        pgprot_t vm_page_prot;
 120
 121        vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
 122        if (vma_wants_writenotify(vma, vm_page_prot)) {
 123                vm_flags &= ~VM_SHARED;
 124                vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
 125        }
 126        /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
 127        WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
 128}
 129
 130/*
 131 * Requires inode->i_mapping->i_mmap_rwsem
 132 */
 133static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 134                struct file *file, struct address_space *mapping)
 135{
 136        if (vma->vm_flags & VM_DENYWRITE)
 137                atomic_inc(&file_inode(file)->i_writecount);
 138        if (vma->vm_flags & VM_SHARED)
 139                mapping_unmap_writable(mapping);
 140
 141        flush_dcache_mmap_lock(mapping);
 142        vma_interval_tree_remove(vma, &mapping->i_mmap);
 143        flush_dcache_mmap_unlock(mapping);
 144}
 145
 146/*
 147 * Unlink a file-based vm structure from its interval tree, to hide
 148 * vma from rmap and vmtruncate before freeing its page tables.
 149 */
 150void unlink_file_vma(struct vm_area_struct *vma)
 151{
 152        struct file *file = vma->vm_file;
 153
 154        if (file) {
 155                struct address_space *mapping = file->f_mapping;
 156                i_mmap_lock_write(mapping);
 157                __remove_shared_vm_struct(vma, file, mapping);
 158                i_mmap_unlock_write(mapping);
 159        }
 160}
 161
 162/*
 163 * Close a vm structure and free it, returning the next.
 164 */
 165static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 166{
 167        struct vm_area_struct *next = vma->vm_next;
 168
 169        might_sleep();
 170        if (vma->vm_ops && vma->vm_ops->close)
 171                vma->vm_ops->close(vma);
 172        if (vma->vm_file)
 173                fput(vma->vm_file);
 174        mpol_put(vma_policy(vma));
 175        kmem_cache_free(vm_area_cachep, vma);
 176        return next;
 177}
 178
 179static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
 180
 181SYSCALL_DEFINE1(brk, unsigned long, brk)
 182{
 183        unsigned long retval;
 184        unsigned long newbrk, oldbrk;
 185        struct mm_struct *mm = current->mm;
 186        struct vm_area_struct *next;
 187        unsigned long min_brk;
 188        bool populate;
 189        LIST_HEAD(uf);
 190
 191        if (down_write_killable(&mm->mmap_sem))
 192                return -EINTR;
 193
 194#ifdef CONFIG_COMPAT_BRK
 195        /*
 196         * CONFIG_COMPAT_BRK can still be overridden by setting
 197         * randomize_va_space to 2, which will still cause mm->start_brk
 198         * to be arbitrarily shifted
 199         */
 200        if (current->brk_randomized)
 201                min_brk = mm->start_brk;
 202        else
 203                min_brk = mm->end_data;
 204#else
 205        min_brk = mm->start_brk;
 206#endif
 207        if (brk < min_brk)
 208                goto out;
 209
 210        /*
 211         * Check against rlimit here. If this check is done later after the test
 212         * of oldbrk with newbrk then it can escape the test and let the data
  213         * segment grow beyond its set limit in the case where the limit is
 214         * not page aligned -Ram Gupta
 215         */
 216        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
 217                              mm->end_data, mm->start_data))
 218                goto out;
 219
 220        newbrk = PAGE_ALIGN(brk);
 221        oldbrk = PAGE_ALIGN(mm->brk);
 222        if (oldbrk == newbrk)
 223                goto set_brk;
 224
 225        /* Always allow shrinking brk. */
 226        if (brk <= mm->brk) {
 227                if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
 228                        goto set_brk;
 229                goto out;
 230        }
 231
 232        /* Check against existing mmap mappings. */
 233        next = find_vma(mm, oldbrk);
 234        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
 235                goto out;
 236
 237        /* Ok, looks good - let it rip. */
 238        if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
 239                goto out;
 240
 241set_brk:
 242        mm->brk = brk;
 243        populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
 244        up_write(&mm->mmap_sem);
 245        userfaultfd_unmap_complete(mm, &uf);
 246        if (populate)
 247                mm_populate(oldbrk, newbrk - oldbrk);
 248        return brk;
 249
 250out:
 251        retval = mm->brk;
 252        up_write(&mm->mmap_sem);
 253        return retval;
 254}
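
/*
 * User-space sketch (hypothetical addresses, not from the original
 * source) of the contract implemented above: a request below min_brk
 * simply reports the current break, so brk(0) is a query:
 *
 *	unsigned long cur = syscall(SYS_brk, 0);
 *	unsigned long new = syscall(SYS_brk, cur + 4096);
 *
 * On success new == cur + 4096; on failure new == cur, i.e. the old
 * break is returned rather than an error code.
 */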
 255
 256static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 257{
 258        unsigned long max, prev_end, subtree_gap;
 259
 260        /*
 261         * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
 262         * allow two stack_guard_gaps between them here, and when choosing
 263         * an unmapped area; whereas when expanding we only require one.
 264         * That's a little inconsistent, but keeps the code here simpler.
 265         */
 266        max = vm_start_gap(vma);
 267        if (vma->vm_prev) {
 268                prev_end = vm_end_gap(vma->vm_prev);
 269                if (max > prev_end)
 270                        max -= prev_end;
 271                else
 272                        max = 0;
 273        }
 274        if (vma->vm_rb.rb_left) {
 275                subtree_gap = rb_entry(vma->vm_rb.rb_left,
 276                                struct vm_area_struct, vm_rb)->rb_subtree_gap;
 277                if (subtree_gap > max)
 278                        max = subtree_gap;
 279        }
 280        if (vma->vm_rb.rb_right) {
 281                subtree_gap = rb_entry(vma->vm_rb.rb_right,
 282                                struct vm_area_struct, vm_rb)->rb_subtree_gap;
 283                if (subtree_gap > max)
 284                        max = subtree_gap;
 285        }
 286        return max;
 287}
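
/*
 * Worked example (hypothetical layout, not from the original source):
 * if vm_end_gap(vma->vm_prev) is 0x3000 and vm_start_gap(vma) is
 * 0x8000, the gap immediately in front of this vma is 0x5000; the
 * returned value is the maximum of that and the rb_subtree_gap of the
 * left and right children, so the root's rb_subtree_gap ends up being
 * the largest free gap anywhere in the tree.
 */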
 288
 289#ifdef CONFIG_DEBUG_VM_RB
 290static int browse_rb(struct mm_struct *mm)
 291{
 292        struct rb_root *root = &mm->mm_rb;
 293        int i = 0, j, bug = 0;
 294        struct rb_node *nd, *pn = NULL;
 295        unsigned long prev = 0, pend = 0;
 296
 297        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
 298                struct vm_area_struct *vma;
 299                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
 300                if (vma->vm_start < prev) {
 301                        pr_emerg("vm_start %lx < prev %lx\n",
 302                                  vma->vm_start, prev);
 303                        bug = 1;
 304                }
 305                if (vma->vm_start < pend) {
 306                        pr_emerg("vm_start %lx < pend %lx\n",
 307                                  vma->vm_start, pend);
 308                        bug = 1;
 309                }
 310                if (vma->vm_start > vma->vm_end) {
 311                        pr_emerg("vm_start %lx > vm_end %lx\n",
 312                                  vma->vm_start, vma->vm_end);
 313                        bug = 1;
 314                }
 315                spin_lock(&mm->page_table_lock);
 316                if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
 317                        pr_emerg("free gap %lx, correct %lx\n",
 318                               vma->rb_subtree_gap,
 319                               vma_compute_subtree_gap(vma));
 320                        bug = 1;
 321                }
 322                spin_unlock(&mm->page_table_lock);
 323                i++;
 324                pn = nd;
 325                prev = vma->vm_start;
 326                pend = vma->vm_end;
 327        }
 328        j = 0;
 329        for (nd = pn; nd; nd = rb_prev(nd))
 330                j++;
 331        if (i != j) {
 332                pr_emerg("backwards %d, forwards %d\n", j, i);
 333                bug = 1;
 334        }
 335        return bug ? -1 : i;
 336}
 337
 338static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
 339{
 340        struct rb_node *nd;
 341
 342        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
 343                struct vm_area_struct *vma;
 344                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
 345                VM_BUG_ON_VMA(vma != ignore &&
 346                        vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
 347                        vma);
 348        }
 349}
 350
 351static void validate_mm(struct mm_struct *mm)
 352{
 353        int bug = 0;
 354        int i = 0;
 355        unsigned long highest_address = 0;
 356        struct vm_area_struct *vma = mm->mmap;
 357
 358        while (vma) {
 359                struct anon_vma *anon_vma = vma->anon_vma;
 360                struct anon_vma_chain *avc;
 361
 362                if (anon_vma) {
 363                        anon_vma_lock_read(anon_vma);
 364                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 365                                anon_vma_interval_tree_verify(avc);
 366                        anon_vma_unlock_read(anon_vma);
 367                }
 368
 369                highest_address = vm_end_gap(vma);
 370                vma = vma->vm_next;
 371                i++;
 372        }
 373        if (i != mm->map_count) {
 374                pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
 375                bug = 1;
 376        }
 377        if (highest_address != mm->highest_vm_end) {
 378                pr_emerg("mm->highest_vm_end %lx, found %lx\n",
 379                          mm->highest_vm_end, highest_address);
 380                bug = 1;
 381        }
 382        i = browse_rb(mm);
 383        if (i != mm->map_count) {
 384                if (i != -1)
 385                        pr_emerg("map_count %d rb %d\n", mm->map_count, i);
 386                bug = 1;
 387        }
 388        VM_BUG_ON_MM(bug, mm);
 389}
 390#else
 391#define validate_mm_rb(root, ignore) do { } while (0)
 392#define validate_mm(mm) do { } while (0)
 393#endif
 394
 395RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
 396                     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
 397
 398/*
 399 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
 400 * vma->vm_prev->vm_end values changed, without modifying the vma's position
 401 * in the rbtree.
 402 */
 403static void vma_gap_update(struct vm_area_struct *vma)
 404{
 405        /*
 406         * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
  407         * function that does exactly what we want.
 408         */
 409        vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
 410}
 411
 412static inline void vma_rb_insert(struct vm_area_struct *vma,
 413                                 struct rb_root *root)
 414{
 415        /* All rb_subtree_gap values must be consistent prior to insertion */
 416        validate_mm_rb(root, NULL);
 417
 418        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 419}
 420
 421static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 422{
 423        /*
 424         * Note rb_erase_augmented is a fairly large inline function,
 425         * so make sure we instantiate it only once with our desired
 426         * augmented rbtree callbacks.
 427         */
 428        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 429}
 430
 431static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
 432                                                struct rb_root *root,
 433                                                struct vm_area_struct *ignore)
 434{
 435        /*
 436         * All rb_subtree_gap values must be consistent prior to erase,
 437         * with the possible exception of the "next" vma being erased if
 438         * next->vm_start was reduced.
 439         */
 440        validate_mm_rb(root, ignore);
 441
 442        __vma_rb_erase(vma, root);
 443}
 444
 445static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
 446                                         struct rb_root *root)
 447{
 448        /*
 449         * All rb_subtree_gap values must be consistent prior to erase,
 450         * with the possible exception of the vma being erased.
 451         */
 452        validate_mm_rb(root, vma);
 453
 454        __vma_rb_erase(vma, root);
 455}
 456
 457/*
 458 * vma has some anon_vma assigned, and is already inserted on that
 459 * anon_vma's interval trees.
 460 *
 461 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 462 * vma must be removed from the anon_vma's interval trees using
 463 * anon_vma_interval_tree_pre_update_vma().
 464 *
 465 * After the update, the vma will be reinserted using
 466 * anon_vma_interval_tree_post_update_vma().
 467 *
 468 * The entire update must be protected by exclusive mmap_sem and by
 469 * the root anon_vma's mutex.
 470 */
 471static inline void
 472anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
 473{
 474        struct anon_vma_chain *avc;
 475
 476        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 477                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
 478}
 479
 480static inline void
 481anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
 482{
 483        struct anon_vma_chain *avc;
 484
 485        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 486                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
 487}
 488
 489static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 490                unsigned long end, struct vm_area_struct **pprev,
 491                struct rb_node ***rb_link, struct rb_node **rb_parent)
 492{
 493        struct rb_node **__rb_link, *__rb_parent, *rb_prev;
 494
 495        __rb_link = &mm->mm_rb.rb_node;
 496        rb_prev = __rb_parent = NULL;
 497
 498        while (*__rb_link) {
 499                struct vm_area_struct *vma_tmp;
 500
 501                __rb_parent = *__rb_link;
 502                vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
 503
 504                if (vma_tmp->vm_end > addr) {
 505                        /* Fail if an existing vma overlaps the area */
 506                        if (vma_tmp->vm_start < end)
 507                                return -ENOMEM;
 508                        __rb_link = &__rb_parent->rb_left;
 509                } else {
 510                        rb_prev = __rb_parent;
 511                        __rb_link = &__rb_parent->rb_right;
 512                }
 513        }
 514
 515        *pprev = NULL;
 516        if (rb_prev)
 517                *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
 518        *rb_link = __rb_link;
 519        *rb_parent = __rb_parent;
 520        return 0;
 521}
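
/*
 * Caller sketch (simplified, not from the original source): resolve the
 * insertion point, fail if [addr, addr + len) overlaps an existing vma,
 * then link the new vma using the cursors computed above:
 *
 *	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 *		return -ENOMEM;
 *	vma_link(mm, vma, prev, rb_link, rb_parent);
 */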
 522
 523static unsigned long count_vma_pages_range(struct mm_struct *mm,
 524                unsigned long addr, unsigned long end)
 525{
 526        unsigned long nr_pages = 0;
 527        struct vm_area_struct *vma;
 528
  529        /* Find first overlapping mapping */
 530        vma = find_vma_intersection(mm, addr, end);
 531        if (!vma)
 532                return 0;
 533
 534        nr_pages = (min(end, vma->vm_end) -
 535                max(addr, vma->vm_start)) >> PAGE_SHIFT;
 536
 537        /* Iterate over the rest of the overlaps */
 538        for (vma = vma->vm_next; vma; vma = vma->vm_next) {
 539                unsigned long overlap_len;
 540
 541                if (vma->vm_start > end)
 542                        break;
 543
 544                overlap_len = min(end, vma->vm_end) - vma->vm_start;
 545                nr_pages += overlap_len >> PAGE_SHIFT;
 546        }
 547
 548        return nr_pages;
 549}
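
/*
 * Worked example (hypothetical layout, not from the original source):
 * for addr = 0x1000 and end = 0x5000 with vmas [0x0000, 0x2000) and
 * [0x3000, 0x6000), the first overlap contributes 0x2000 - 0x1000 and
 * the second 0x5000 - 0x3000, i.e. 1 + 2 = 3 pages with 4KiB pages.
 */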
 550
 551void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 552                struct rb_node **rb_link, struct rb_node *rb_parent)
 553{
 554        /* Update tracking information for the gap following the new vma. */
 555        if (vma->vm_next)
 556                vma_gap_update(vma->vm_next);
 557        else
 558                mm->highest_vm_end = vm_end_gap(vma);
 559
 560        /*
 561         * vma->vm_prev wasn't known when we followed the rbtree to find the
 562         * correct insertion point for that vma. As a result, we could not
  563         * update the rb_subtree_gap values of the vma's vm_rb parents on the way down.
 564         * So, we first insert the vma with a zero rb_subtree_gap value
 565         * (to be consistent with what we did on the way down), and then
 566         * immediately update the gap to the correct value. Finally we
 567         * rebalance the rbtree after all augmented values have been set.
 568         */
 569        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 570        vma->rb_subtree_gap = 0;
 571        vma_gap_update(vma);
 572        vma_rb_insert(vma, &mm->mm_rb);
 573}
 574
 575static void __vma_link_file(struct vm_area_struct *vma)
 576{
 577        struct file *file;
 578
 579        file = vma->vm_file;
 580        if (file) {
 581                struct address_space *mapping = file->f_mapping;
 582
 583                if (vma->vm_flags & VM_DENYWRITE)
 584                        atomic_dec(&file_inode(file)->i_writecount);
 585                if (vma->vm_flags & VM_SHARED)
 586                        atomic_inc(&mapping->i_mmap_writable);
 587
 588                flush_dcache_mmap_lock(mapping);
 589                vma_interval_tree_insert(vma, &mapping->i_mmap);
 590                flush_dcache_mmap_unlock(mapping);
 591        }
 592}
 593
 594static void
 595__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 596        struct vm_area_struct *prev, struct rb_node **rb_link,
 597        struct rb_node *rb_parent)
 598{
 599        __vma_link_list(mm, vma, prev, rb_parent);
 600        __vma_link_rb(mm, vma, rb_link, rb_parent);
 601}
 602
 603static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 604                        struct vm_area_struct *prev, struct rb_node **rb_link,
 605                        struct rb_node *rb_parent)
 606{
 607        struct address_space *mapping = NULL;
 608
 609        if (vma->vm_file) {
 610                mapping = vma->vm_file->f_mapping;
 611                i_mmap_lock_write(mapping);
 612        }
 613
 614        __vma_link(mm, vma, prev, rb_link, rb_parent);
 615        __vma_link_file(vma);
 616
 617        if (mapping)
 618                i_mmap_unlock_write(mapping);
 619
 620        mm->map_count++;
 621        validate_mm(mm);
 622}
 623
 624/*
 625 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
 626 * mm's list and rbtree.  It has already been inserted into the interval tree.
 627 */
 628static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 629{
 630        struct vm_area_struct *prev;
 631        struct rb_node **rb_link, *rb_parent;
 632
 633        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
 634                           &prev, &rb_link, &rb_parent))
 635                BUG();
 636        __vma_link(mm, vma, prev, rb_link, rb_parent);
 637        mm->map_count++;
 638}
 639
 640static __always_inline void __vma_unlink_common(struct mm_struct *mm,
 641                                                struct vm_area_struct *vma,
 642                                                struct vm_area_struct *prev,
 643                                                bool has_prev,
 644                                                struct vm_area_struct *ignore)
 645{
 646        struct vm_area_struct *next;
 647
 648        vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
 649        next = vma->vm_next;
 650        if (has_prev)
 651                prev->vm_next = next;
 652        else {
 653                prev = vma->vm_prev;
 654                if (prev)
 655                        prev->vm_next = next;
 656                else
 657                        mm->mmap = next;
 658        }
 659        if (next)
 660                next->vm_prev = prev;
 661
 662        /* Kill the cache */
 663        vmacache_invalidate(mm);
 664}
 665
 666static inline void __vma_unlink_prev(struct mm_struct *mm,
 667                                     struct vm_area_struct *vma,
 668                                     struct vm_area_struct *prev)
 669{
 670        __vma_unlink_common(mm, vma, prev, true, vma);
 671}
 672
 673/*
 674 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 675 * is already present in an i_mmap tree without adjusting the tree.
 676 * The following helper function should be used when such adjustments
 677 * are necessary.  The "insert" vma (if any) is to be inserted
 678 * before we drop the necessary locks.
 679 */
 680int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 681        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
 682        struct vm_area_struct *expand)
 683{
 684        struct mm_struct *mm = vma->vm_mm;
 685        struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
 686        struct address_space *mapping = NULL;
 687        struct rb_root *root = NULL;
 688        struct anon_vma *anon_vma = NULL;
 689        struct file *file = vma->vm_file;
 690        bool start_changed = false, end_changed = false;
 691        long adjust_next = 0;
 692        int remove_next = 0;
 693
 694        if (next && !insert) {
 695                struct vm_area_struct *exporter = NULL, *importer = NULL;
 696
 697                if (end >= next->vm_end) {
 698                        /*
 699                         * vma expands, overlapping all the next, and
 700                         * perhaps the one after too (mprotect case 6).
 701                         * The only other cases that gets here are
 702                         * case 1, case 7 and case 8.
 703                         */
 704                        if (next == expand) {
 705                                /*
 706                                 * The only case where we don't expand "vma"
 707                                 * and we expand "next" instead is case 8.
 708                                 */
 709                                VM_WARN_ON(end != next->vm_end);
 710                                /*
 711                                 * remove_next == 3 means we're
 712                                 * removing "vma" and that to do so we
 713                                 * swapped "vma" and "next".
 714                                 */
 715                                remove_next = 3;
 716                                VM_WARN_ON(file != next->vm_file);
 717                                swap(vma, next);
 718                        } else {
 719                                VM_WARN_ON(expand != vma);
 720                                /*
 721                                 * case 1, 6, 7, remove_next == 2 is case 6,
 722                                 * remove_next == 1 is case 1 or 7.
 723                                 */
 724                                remove_next = 1 + (end > next->vm_end);
 725                                VM_WARN_ON(remove_next == 2 &&
 726                                           end != next->vm_next->vm_end);
 727                                VM_WARN_ON(remove_next == 1 &&
 728                                           end != next->vm_end);
 729                                /* trim end to next, for case 6 first pass */
 730                                end = next->vm_end;
 731                        }
 732
 733                        exporter = next;
 734                        importer = vma;
 735
 736                        /*
 737                         * If next doesn't have anon_vma, import from vma after
 738                         * next, if the vma overlaps with it.
 739                         */
 740                        if (remove_next == 2 && !next->anon_vma)
 741                                exporter = next->vm_next;
 742
 743                } else if (end > next->vm_start) {
 744                        /*
 745                         * vma expands, overlapping part of the next:
 746                         * mprotect case 5 shifting the boundary up.
 747                         */
 748                        adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
 749                        exporter = next;
 750                        importer = vma;
 751                        VM_WARN_ON(expand != importer);
 752                } else if (end < vma->vm_end) {
 753                        /*
 754                         * vma shrinks, and !insert tells it's not
 755                         * split_vma inserting another: so it must be
 756                         * mprotect case 4 shifting the boundary down.
 757                         */
 758                        adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
 759                        exporter = vma;
 760                        importer = next;
 761                        VM_WARN_ON(expand != importer);
 762                }
 763
 764                /*
 765                 * Easily overlooked: when mprotect shifts the boundary,
 766                 * make sure the expanding vma has anon_vma set if the
 767                 * shrinking vma had, to cover any anon pages imported.
 768                 */
 769                if (exporter && exporter->anon_vma && !importer->anon_vma) {
 770                        int error;
 771
 772                        importer->anon_vma = exporter->anon_vma;
 773                        error = anon_vma_clone(importer, exporter);
 774                        if (error)
 775                                return error;
 776                }
 777        }
 778again:
 779        vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 780
 781        if (file) {
 782                mapping = file->f_mapping;
 783                root = &mapping->i_mmap;
 784                uprobe_munmap(vma, vma->vm_start, vma->vm_end);
 785
 786                if (adjust_next)
 787                        uprobe_munmap(next, next->vm_start, next->vm_end);
 788
 789                i_mmap_lock_write(mapping);
 790                if (insert) {
 791                        /*
 792                         * Put into interval tree now, so instantiated pages
 793                         * are visible to arm/parisc __flush_dcache_page
 794                         * throughout; but we cannot insert into address
 795                         * space until vma start or end is updated.
 796                         */
 797                        __vma_link_file(insert);
 798                }
 799        }
 800
 801        anon_vma = vma->anon_vma;
 802        if (!anon_vma && adjust_next)
 803                anon_vma = next->anon_vma;
 804        if (anon_vma) {
 805                VM_WARN_ON(adjust_next && next->anon_vma &&
 806                           anon_vma != next->anon_vma);
 807                anon_vma_lock_write(anon_vma);
 808                anon_vma_interval_tree_pre_update_vma(vma);
 809                if (adjust_next)
 810                        anon_vma_interval_tree_pre_update_vma(next);
 811        }
 812
 813        if (root) {
 814                flush_dcache_mmap_lock(mapping);
 815                vma_interval_tree_remove(vma, root);
 816                if (adjust_next)
 817                        vma_interval_tree_remove(next, root);
 818        }
 819
 820        if (start != vma->vm_start) {
 821                vma->vm_start = start;
 822                start_changed = true;
 823        }
 824        if (end != vma->vm_end) {
 825                vma->vm_end = end;
 826                end_changed = true;
 827        }
 828        vma->vm_pgoff = pgoff;
 829        if (adjust_next) {
 830                next->vm_start += adjust_next << PAGE_SHIFT;
 831                next->vm_pgoff += adjust_next;
 832        }
 833
 834        if (root) {
 835                if (adjust_next)
 836                        vma_interval_tree_insert(next, root);
 837                vma_interval_tree_insert(vma, root);
 838                flush_dcache_mmap_unlock(mapping);
 839        }
 840
 841        if (remove_next) {
 842                /*
 843                 * vma_merge has merged next into vma, and needs
 844                 * us to remove next before dropping the locks.
 845                 */
 846                if (remove_next != 3)
 847                        __vma_unlink_prev(mm, next, vma);
 848                else
 849                        /*
 850                         * vma is not before next if they've been
 851                         * swapped.
 852                         *
 853                         * pre-swap() next->vm_start was reduced so
 854                         * tell validate_mm_rb to ignore pre-swap()
 855                         * "next" (which is stored in post-swap()
 856                         * "vma").
 857                         */
 858                        __vma_unlink_common(mm, next, NULL, false, vma);
 859                if (file)
 860                        __remove_shared_vm_struct(next, file, mapping);
 861        } else if (insert) {
 862                /*
 863                 * split_vma has split insert from vma, and needs
 864                 * us to insert it before dropping the locks
 865                 * (it may either follow vma or precede it).
 866                 */
 867                __insert_vm_struct(mm, insert);
 868        } else {
 869                if (start_changed)
 870                        vma_gap_update(vma);
 871                if (end_changed) {
 872                        if (!next)
 873                                mm->highest_vm_end = vm_end_gap(vma);
 874                        else if (!adjust_next)
 875                                vma_gap_update(next);
 876                }
 877        }
 878
 879        if (anon_vma) {
 880                anon_vma_interval_tree_post_update_vma(vma);
 881                if (adjust_next)
 882                        anon_vma_interval_tree_post_update_vma(next);
 883                anon_vma_unlock_write(anon_vma);
 884        }
 885        if (mapping)
 886                i_mmap_unlock_write(mapping);
 887
 888        if (root) {
 889                uprobe_mmap(vma);
 890
 891                if (adjust_next)
 892                        uprobe_mmap(next);
 893        }
 894
 895        if (remove_next) {
 896                if (file) {
 897                        uprobe_munmap(next, next->vm_start, next->vm_end);
 898                        fput(file);
 899                }
 900                if (next->anon_vma)
 901                        anon_vma_merge(vma, next);
 902                mm->map_count--;
 903                mpol_put(vma_policy(next));
 904                kmem_cache_free(vm_area_cachep, next);
 905                /*
 906                 * In mprotect's case 6 (see comments on vma_merge),
 907                 * we must remove another next too. It would clutter
 908                 * up the code too much to do both in one go.
 909                 */
 910                if (remove_next != 3) {
 911                        /*
 912                         * If "next" was removed and vma->vm_end was
 913                         * expanded (up) over it, in turn
 914                         * "next->vm_prev->vm_end" changed and the
 915                         * "vma->vm_next" gap must be updated.
 916                         */
 917                        next = vma->vm_next;
 918                } else {
 919                        /*
 920                         * For the scope of the comment "next" and
 921                         * "vma" considered pre-swap(): if "vma" was
 922                         * removed, next->vm_start was expanded (down)
 923                         * over it and the "next" gap must be updated.
 924                         * Because of the swap() the post-swap() "vma"
 925                         * actually points to pre-swap() "next"
 926                         * (post-swap() "next" as opposed is now a
 927                         * dangling pointer).
 928                         */
 929                        next = vma;
 930                }
 931                if (remove_next == 2) {
 932                        remove_next = 1;
 933                        end = next->vm_end;
 934                        goto again;
 935                }
 936                else if (next)
 937                        vma_gap_update(next);
 938                else {
 939                        /*
 940                         * If remove_next == 2 we obviously can't
 941                         * reach this path.
 942                         *
 943                         * If remove_next == 3 we can't reach this
 944                         * path because pre-swap() next is always not
 945                         * NULL. pre-swap() "next" is not being
 946                         * removed and its next->vm_end is not altered
 947                         * (and furthermore "end" already matches
 948                         * next->vm_end in remove_next == 3).
 949                         *
 950                         * We reach this only in the remove_next == 1
 951                         * case if the "next" vma that was removed was
 952                         * the highest vma of the mm. However in such
 953                         * case next->vm_end == "end" and the extended
 954                         * "vma" has vma->vm_end == next->vm_end so
 955                         * mm->highest_vm_end doesn't need any update
 956                         * in remove_next == 1 case.
 957                         */
 958                        VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
 959                }
 960        }
 961        if (insert && file)
 962                uprobe_mmap(insert);
 963
 964        validate_mm(mm);
 965
 966        return 0;
 967}
 968
 969/*
 970 * If the vma has a ->close operation then the driver probably needs to release
 971 * per-vma resources, so we don't attempt to merge those.
 972 */
 973static inline int is_mergeable_vma(struct vm_area_struct *vma,
 974                                struct file *file, unsigned long vm_flags,
 975                                struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
 976{
 977        /*
 978         * VM_SOFTDIRTY should not prevent VMA merging if the flags
 979         * match in everything but the dirty bit -- the caller should
 980         * mark the merged VMA as dirty. If the dirty bit were not
 981         * excluded from the comparison, we would increase pressure on
 982         * the memory system by forcing the kernel to generate new VMAs
 983         * when an old one could have been extended instead.
 984         */
 985        if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
 986                return 0;
 987        if (vma->vm_file != file)
 988                return 0;
 989        if (vma->vm_ops && vma->vm_ops->close)
 990                return 0;
 991        if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
 992                return 0;
 993        return 1;
 994}
 995
 996static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
 997                                        struct anon_vma *anon_vma2,
 998                                        struct vm_area_struct *vma)
 999{
1000        /*
 1001         * The list_is_singular() test is to avoid merging VMAs cloned from
 1002         * parents. This improves scalability by reducing anon_vma lock contention.
1003         */
1004        if ((!anon_vma1 || !anon_vma2) && (!vma ||
1005                list_is_singular(&vma->anon_vma_chain)))
1006                return 1;
1007        return anon_vma1 == anon_vma2;
1008}
1009
1010/*
1011 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
1012 * in front of (at a lower virtual address and file offset than) the vma.
1013 *
1014 * We cannot merge two vmas if they have differently assigned (non-NULL)
1015 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
1016 *
1017 * We don't check here for the merged mmap wrapping around the end of pagecache
1018 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
1019 * wrap, nor mmaps which cover the final page at index -1UL.
1020 */
1021static int
1022can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
1023                     struct anon_vma *anon_vma, struct file *file,
1024                     pgoff_t vm_pgoff,
1025                     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1026{
1027        if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
1028            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1029                if (vma->vm_pgoff == vm_pgoff)
1030                        return 1;
1031        }
1032        return 0;
1033}
1034
1035/*
1036 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
1037 * beyond (at a higher virtual address and file offset than) the vma.
1038 *
1039 * We cannot merge two vmas if they have differently assigned (non-NULL)
1040 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
1041 */
1042static int
1043can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1044                    struct anon_vma *anon_vma, struct file *file,
1045                    pgoff_t vm_pgoff,
1046                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1047{
1048        if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
1049            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1050                pgoff_t vm_pglen;
1051                vm_pglen = vma_pages(vma);
1052                if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1053                        return 1;
1054        }
1055        return 0;
1056}
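
/*
 * Example of the offset check above (hypothetical numbers): a vma
 * mapping file pages [10, 14) can only be extended at its end by a
 * region whose vm_pgoff is exactly 14, so that the merged vma still
 * maps one contiguous range of the file.
 */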
1057
1058/*
1059 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
1060 * whether that can be merged with its predecessor or its successor.
1061 * Or both (it neatly fills a hole).
1062 *
1063 * In most cases - when called for mmap, brk or mremap - [addr,end) is
1064 * certain not to be mapped by the time vma_merge is called; but when
1065 * called for mprotect, it is certain to be already mapped (either at
1066 * an offset within prev, or at the start of next), and the flags of
1067 * this area are about to be changed to vm_flags - and the no-change
1068 * case has already been eliminated.
1069 *
1070 * The following mprotect cases have to be considered, where AAAA is
1071 * the area passed down from mprotect_fixup, never extending beyond one
1072 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
1073 *
1074 *     AAAA             AAAA                AAAA          AAAA
1075 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
1076 *    cannot merge    might become    might become    might become
1077 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
1078 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
1079 *    mremap move:                                    PPPPXXXXXXXX 8
1080 *        AAAA
1081 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
1082 *    might become    case 1 below    case 2 below    case 3 below
1083 *
 1084 * It is important for case 8 that the vma NNNN overlapping the
 1085 * region AAAA is never going to be extended over XXXX. Instead XXXX must
1086 * be extended in region AAAA and NNNN must be removed. This way in
1087 * all cases where vma_merge succeeds, the moment vma_adjust drops the
1088 * rmap_locks, the properties of the merged vma will be already
1089 * correct for the whole merged range. Some of those properties like
1090 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
1091 * be correct for the whole merged range immediately after the
1092 * rmap_locks are released. Otherwise if XXXX would be removed and
1093 * NNNN would be extended over the XXXX range, remove_migration_ptes
1094 * or other rmap walkers (if working on addresses beyond the "end"
1095 * parameter) may establish ptes with the wrong permissions of NNNN
1096 * instead of the right permissions of XXXX.
1097 */
1098struct vm_area_struct *vma_merge(struct mm_struct *mm,
1099                        struct vm_area_struct *prev, unsigned long addr,
1100                        unsigned long end, unsigned long vm_flags,
1101                        struct anon_vma *anon_vma, struct file *file,
1102                        pgoff_t pgoff, struct mempolicy *policy,
1103                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1104{
1105        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1106        struct vm_area_struct *area, *next;
1107        int err;
1108
1109        /*
1110         * We later require that vma->vm_flags == vm_flags,
1111         * so this tests vma->vm_flags & VM_SPECIAL, too.
1112         */
1113        if (vm_flags & VM_SPECIAL)
1114                return NULL;
1115
1116        if (prev)
1117                next = prev->vm_next;
1118        else
1119                next = mm->mmap;
1120        area = next;
1121        if (area && area->vm_end == end)                /* cases 6, 7, 8 */
1122                next = next->vm_next;
1123
 1124        /* verify some invariants that must be enforced by the caller */
1125        VM_WARN_ON(prev && addr <= prev->vm_start);
1126        VM_WARN_ON(area && end > area->vm_end);
1127        VM_WARN_ON(addr >= end);
1128
1129        /*
1130         * Can it merge with the predecessor?
1131         */
1132        if (prev && prev->vm_end == addr &&
1133                        mpol_equal(vma_policy(prev), policy) &&
1134                        can_vma_merge_after(prev, vm_flags,
1135                                            anon_vma, file, pgoff,
1136                                            vm_userfaultfd_ctx)) {
1137                /*
1138                 * OK, it can.  Can we now merge in the successor as well?
1139                 */
1140                if (next && end == next->vm_start &&
1141                                mpol_equal(policy, vma_policy(next)) &&
1142                                can_vma_merge_before(next, vm_flags,
1143                                                     anon_vma, file,
1144                                                     pgoff+pglen,
1145                                                     vm_userfaultfd_ctx) &&
1146                                is_mergeable_anon_vma(prev->anon_vma,
1147                                                      next->anon_vma, NULL)) {
1148                                                        /* cases 1, 6 */
1149                        err = __vma_adjust(prev, prev->vm_start,
1150                                         next->vm_end, prev->vm_pgoff, NULL,
1151                                         prev);
1152                } else                                  /* cases 2, 5, 7 */
1153                        err = __vma_adjust(prev, prev->vm_start,
1154                                         end, prev->vm_pgoff, NULL, prev);
1155                if (err)
1156                        return NULL;
1157                khugepaged_enter_vma_merge(prev, vm_flags);
1158                return prev;
1159        }
1160
1161        /*
1162         * Can this new request be merged in front of next?
1163         */
1164        if (next && end == next->vm_start &&
1165                        mpol_equal(policy, vma_policy(next)) &&
1166                        can_vma_merge_before(next, vm_flags,
1167                                             anon_vma, file, pgoff+pglen,
1168                                             vm_userfaultfd_ctx)) {
1169                if (prev && addr < prev->vm_end)        /* case 4 */
1170                        err = __vma_adjust(prev, prev->vm_start,
1171                                         addr, prev->vm_pgoff, NULL, next);
1172                else {                                  /* cases 3, 8 */
1173                        err = __vma_adjust(area, addr, next->vm_end,
1174                                         next->vm_pgoff - pglen, NULL, next);
1175                        /*
1176                         * In case 3 area is already equal to next and
1177                         * this is a noop, but in case 8 "area" has
1178                         * been removed and next was expanded over it.
1179                         */
1180                        area = next;
1181                }
1182                if (err)
1183                        return NULL;
1184                khugepaged_enter_vma_merge(area, vm_flags);
1185                return area;
1186        }
1187
1188        return NULL;
1189}
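
/*
 * Caller sketch (simplified from the mmap call site; not part of the
 * original source): try to extend a neighbouring vma first and only
 * allocate a fresh vm_area_struct when no merge is possible:
 *
 *	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
 *			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
 *	if (!vma)
 *		... allocate, initialize and vma_link() a new vma ...
 */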
1190
1191/*
 1192 * Rough compatibility check to quickly see if it's even worth looking
1193 * at sharing an anon_vma.
1194 *
1195 * They need to have the same vm_file, and the flags can only differ
1196 * in things that mprotect may change.
1197 *
1198 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1199 * we can merge the two vma's. For example, we refuse to merge a vma if
1200 * there is a vm_ops->close() function, because that indicates that the
1201 * driver is doing some kind of reference counting. But that doesn't
1202 * really matter for the anon_vma sharing case.
1203 */
1204static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1205{
1206        return a->vm_end == b->vm_start &&
1207                mpol_equal(vma_policy(a), vma_policy(b)) &&
1208                a->vm_file == b->vm_file &&
1209                !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1210                b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1211}
1212
1213/*
1214 * Do some basic sanity checking to see if we can re-use the anon_vma
1215 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1216 * the same as 'old', the other will be the new one that is trying
1217 * to share the anon_vma.
1218 *
 1219 * NOTE! This runs with mmap_sem held for reading, so it is possible that
1220 * the anon_vma of 'old' is concurrently in the process of being set up
1221 * by another page fault trying to merge _that_. But that's ok: if it
1222 * is being set up, that automatically means that it will be a singleton
1223 * acceptable for merging, so we can do all of this optimistically. But
1224 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1225 *
1226 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1227 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1228 * is to return an anon_vma that is "complex" due to having gone through
1229 * a fork).
1230 *
1231 * We also make sure that the two vma's are compatible (adjacent,
1232 * and with the same memory policies). That's all stable, even with just
 1233 * a read lock on the mmap_sem.
1234 */
1235static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1236{
1237        if (anon_vma_compatible(a, b)) {
1238                struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1239
1240                if (anon_vma && list_is_singular(&old->anon_vma_chain))
1241                        return anon_vma;
1242        }
1243        return NULL;
1244}
1245
1246/*
1247 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1248 * neighbouring vmas for a suitable anon_vma, before it goes off
1249 * to allocate a new anon_vma.  It checks because a repetitive
1250 * sequence of mprotects and faults may otherwise lead to distinct
1251 * anon_vmas being allocated, preventing vma merge in subsequent
1252 * mprotect.
1253 */
1254struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1255{
1256        struct anon_vma *anon_vma;
1257        struct vm_area_struct *near;
1258
1259        near = vma->vm_next;
1260        if (!near)
1261                goto try_prev;
1262
1263        anon_vma = reusable_anon_vma(near, vma, near);
1264        if (anon_vma)
1265                return anon_vma;
1266try_prev:
1267        near = vma->vm_prev;
1268        if (!near)
1269                goto none;
1270
1271        anon_vma = reusable_anon_vma(near, near, vma);
1272        if (anon_vma)
1273                return anon_vma;
1274none:
1275        /*
1276         * There's no absolute need to look only at touching neighbours:
1277         * we could search further afield for "compatible" anon_vmas.
1278         * But it would probably just be a waste of time searching,
1279         * or lead to too many vmas hanging off the same anon_vma.
1280         * We're trying to allow mprotect remerging later on,
1281         * not trying to minimize memory used for anon_vmas.
1282         */
1283        return NULL;
1284}
1285
1286/*
1287 * If a hint addr is less than mmap_min_addr change hint to be as
1288 * low as possible but still greater than mmap_min_addr
1289 */
1290static inline unsigned long round_hint_to_min(unsigned long hint)
1291{
1292        hint &= PAGE_MASK;
1293        if (((void *)hint != NULL) &&
1294            (hint < mmap_min_addr))
1295                return PAGE_ALIGN(mmap_min_addr);
1296        return hint;
1297}
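
/*
 * Example (hypothetical values): with mmap_min_addr at 64KiB, a hint of
 * 0x1000 is rounded up to 0x10000, while a NULL hint is returned as is
 * so that get_unmapped_area() remains free to pick any address.
 */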
1298
1299static inline int mlock_future_check(struct mm_struct *mm,
1300                                     unsigned long flags,
1301                                     unsigned long len)
1302{
1303        unsigned long locked, lock_limit;
1304
1305        /*  mlock MCL_FUTURE? */
1306        if (flags & VM_LOCKED) {
1307                locked = len >> PAGE_SHIFT;
1308                locked += mm->locked_vm;
1309                lock_limit = rlimit(RLIMIT_MEMLOCK);
1310                lock_limit >>= PAGE_SHIFT;
1311                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1312                        return -EAGAIN;
1313        }
1314        return 0;
1315}
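
/*
 * Example (hypothetical numbers): with RLIMIT_MEMLOCK at 64KiB (16 pages
 * of 4KiB) and 8 pages already in mm->locked_vm, a VM_LOCKED request for
 * 40KiB makes locked = 10 + 8 = 18 > 16, so the check above returns
 * -EAGAIN unless the caller has CAP_IPC_LOCK.
 */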
1316
1317/*
1318 * The caller must hold down_write(&current->mm->mmap_sem).
1319 */
1320unsigned long do_mmap(struct file *file, unsigned long addr,
1321                        unsigned long len, unsigned long prot,
1322                        unsigned long flags, vm_flags_t vm_flags,
1323                        unsigned long pgoff, unsigned long *populate,
1324                        struct list_head *uf)
1325{
1326        struct mm_struct *mm = current->mm;
1327        int pkey = 0;
1328
1329        *populate = 0;
1330
1331        if (!len)
1332                return -EINVAL;
1333
1334        /*
1335         * Does the application expect PROT_READ to imply PROT_EXEC?
1336         *
1337         * (the exception is when the underlying filesystem is noexec
 1338         *  mounted, in which case we don't add PROT_EXEC.)
1339         */
1340        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1341                if (!(file && path_noexec(&file->f_path)))
1342                        prot |= PROT_EXEC;
1343
1344        if (!(flags & MAP_FIXED))
1345                addr = round_hint_to_min(addr);
1346
1347        /* Careful about overflows.. */
1348        len = PAGE_ALIGN(len);
1349        if (!len)
1350                return -ENOMEM;
1351
1352        /* offset overflow? */
1353        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1354                return -EOVERFLOW;
1355
1356        /* Too many mappings? */
1357        if (mm->map_count > sysctl_max_map_count)
1358                return -ENOMEM;
1359
 1360        /* Obtain the address to map to. We verify (or select) it and ensure
1361         * that it represents a valid section of the address space.
1362         */
1363        addr = get_unmapped_area(file, addr, len, pgoff, flags);
1364        if (offset_in_page(addr))
1365                return addr;
1366
1367        if (prot == PROT_EXEC) {
1368                pkey = execute_only_pkey(mm);
1369                if (pkey < 0)
1370                        pkey = 0;
1371        }
1372
1373        /* Do simple checking here so the lower-level routines won't have
1374         * to. We assume access permissions have been handled by the open
1375         * of the memory object, so we don't do any here.
1376         */
1377        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1378                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1379
1380        if (flags & MAP_LOCKED)
1381                if (!can_do_mlock())
1382                        return -EPERM;
1383
1384        if (mlock_future_check(mm, vm_flags, len))
1385                return -EAGAIN;
1386
1387        if (file) {
1388                struct inode *inode = file_inode(file);
1389
1390                switch (flags & MAP_TYPE) {
1391                case MAP_SHARED:
1392                        if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1393                                return -EACCES;
1394
1395                        /*
1396                         * Make sure we don't allow writing to an append-only
1397                         * file..
1398                         */
1399                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1400                                return -EACCES;
1401
1402                        /*
1403                         * Make sure there are no mandatory locks on the file.
1404                         */
1405                        if (locks_verify_locked(file))
1406                                return -EAGAIN;
1407
1408                        vm_flags |= VM_SHARED | VM_MAYSHARE;
1409                        if (!(file->f_mode & FMODE_WRITE))
1410                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1411
1412                        /* fall through */
1413                case MAP_PRIVATE:
1414                        if (!(file->f_mode & FMODE_READ))
1415                                return -EACCES;
1416                        if (path_noexec(&file->f_path)) {
1417                                if (vm_flags & VM_EXEC)
1418                                        return -EPERM;
1419                                vm_flags &= ~VM_MAYEXEC;
1420                        }
1421
1422                        if (!file->f_op->mmap)
1423                                return -ENODEV;
1424                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1425                                return -EINVAL;
1426                        break;
1427
1428                default:
1429                        return -EINVAL;
1430                }
1431        } else {
1432                switch (flags & MAP_TYPE) {
1433                case MAP_SHARED:
1434                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1435                                return -EINVAL;
1436                        /*
1437                         * Ignore pgoff.
1438                         */
1439                        pgoff = 0;
1440                        vm_flags |= VM_SHARED | VM_MAYSHARE;
1441                        break;
1442                case MAP_PRIVATE:
1443                        /*
1444                         * Set pgoff according to addr for anon_vma.
1445                         */
1446                        pgoff = addr >> PAGE_SHIFT;
1447                        break;
1448                default:
1449                        return -EINVAL;
1450                }
1451        }
1452
1453        /*
1454         * Set 'VM_NORESERVE' if we should not account for the
1455         * memory use of this mapping.
1456         */
1457        if (flags & MAP_NORESERVE) {
1458                /* We honor MAP_NORESERVE if allowed to overcommit */
1459                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1460                        vm_flags |= VM_NORESERVE;
1461
1462                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1463                if (file && is_file_hugepages(file))
1464                        vm_flags |= VM_NORESERVE;
1465        }
1466
1467        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
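        /*
         * On success, tell the caller to prefault the whole range when the
         * mapping is mlocked, or when MAP_POPULATE was passed without
         * MAP_NONBLOCK.
         */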
1468        if (!IS_ERR_VALUE(addr) &&
1469            ((vm_flags & VM_LOCKED) ||
1470             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1471                *populate = len;
1472        return addr;
1473}
1474
1475SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1476                unsigned long, prot, unsigned long, flags,
1477                unsigned long, fd, unsigned long, pgoff)
1478{
1479        struct file *file = NULL;
1480        unsigned long retval;
1481
1482        if (!(flags & MAP_ANONYMOUS)) {
1483                audit_mmap_fd(fd, flags);
1484                file = fget(fd);
1485                if (!file)
1486                        return -EBADF;
1487                if (is_file_hugepages(file))
1488                        len = ALIGN(len, huge_page_size(hstate_file(file)));
1489                retval = -EINVAL;
1490                if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1491                        goto out_fput;
1492        } else if (flags & MAP_HUGETLB) {
1493                struct user_struct *user = NULL;
1494                struct hstate *hs;
1495
1496                hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1497                if (!hs)
1498                        return -EINVAL;
1499
1500                len = ALIGN(len, huge_page_size(hs));
1501                /*
1502                 * VM_NORESERVE is used because the reservations will be
1503                 * taken when vm_ops->mmap() is called.
1504                 * A dummy user value is used because we are not locking
1505                 * memory, so no accounting is necessary.
1506                 */
1507                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1508                                VM_NORESERVE,
1509                                &user, HUGETLB_ANONHUGE_INODE,
1510                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1511                if (IS_ERR(file))
1512                        return PTR_ERR(file);
1513        }
1514
1515        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1516
1517        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1518out_fput:
1519        if (file)
1520                fput(file);
1521        return retval;
1522}
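
/*
 * Illustrative user-space sketch (kept under #if 0, not part of the kernel
 * build): how the MAP_HUGE_SHIFT encoding consumed above is typically
 * supplied by an application.  Assumes 2 MiB huge pages have been reserved
 * (e.g. via /proc/sys/vm/nr_hugepages); otherwise the mmap() below is
 * expected to fail with ENOMEM.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT	26
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* log2(2 MiB) = 21 */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2 MiB huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0, len);		/* touch the huge page */
	munmap(p, len);
	return 0;
}
#endif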
1523
1524#ifdef __ARCH_WANT_SYS_OLD_MMAP
1525struct mmap_arg_struct {
1526        unsigned long addr;
1527        unsigned long len;
1528        unsigned long prot;
1529        unsigned long flags;
1530        unsigned long fd;
1531        unsigned long offset;
1532};
1533
1534SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1535{
1536        struct mmap_arg_struct a;
1537
1538        if (copy_from_user(&a, arg, sizeof(a)))
1539                return -EFAULT;
1540        if (offset_in_page(a.offset))
1541                return -EINVAL;
1542
1543        return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1544                              a.offset >> PAGE_SHIFT);
1545}
1546#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1547
1548/*
1549 * Some shared mappings will want the pages marked read-only
1550 * to track write events. If so, we'll downgrade vm_page_prot
1551 * to the private version (using protection_map[] without the
1552 * VM_SHARED bit).
1553 */
1554int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
1555{
1556        vm_flags_t vm_flags = vma->vm_flags;
1557        const struct vm_operations_struct *vm_ops = vma->vm_ops;
1558
1559        /* If it was private or non-writable, the write bit is already clear */
1560        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1561                return 0;
1562
1563        /* The backer wishes to know when pages are first written to? */
1564        if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
1565                return 1;
1566
1567        /* The open routine did something to the protections that pgprot_modify
1568         * won't preserve? */
1569        if (pgprot_val(vm_page_prot) !=
1570            pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
1571                return 0;
1572
1573        /* Do we need to track softdirty? */
1574        if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1575                return 1;
1576
1577        /* Specialty mapping? */
1578        if (vm_flags & VM_PFNMAP)
1579                return 0;
1580
1581        /* Can the mapping track the dirty pages? */
1582        return vma->vm_file && vma->vm_file->f_mapping &&
1583                mapping_cap_account_dirty(vma->vm_file->f_mapping);
1584}
1585
1586/*
1587 * We account for memory if it's a private writeable mapping,
1588 * not hugepages and VM_NORESERVE wasn't set.
1589 */
1590static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1591{
1592        /*
1593         * hugetlb has its own accounting separate from the core VM
1594         * VM_HUGETLB may not be set yet so we cannot check for that flag.
1595         */
1596        if (file && is_file_hugepages(file))
1597                return 0;
1598
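        /*
         * i.e. account only when VM_WRITE is set and both VM_SHARED and
         * VM_NORESERVE are clear: a private, writable mapping with
         * reservations enabled.
         */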
1599        return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1600}
1601
1602unsigned long mmap_region(struct file *file, unsigned long addr,
1603                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
1604                struct list_head *uf)
1605{
1606        struct mm_struct *mm = current->mm;
1607        struct vm_area_struct *vma, *prev;
1608        int error;
1609        struct rb_node **rb_link, *rb_parent;
1610        unsigned long charged = 0;
1611
1612        /* Check against address space limit. */
1613        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
1614                unsigned long nr_pages;
1615
1616                /*
1617                 * MAP_FIXED may remove pages of mappings that intersect with
1618                 * the requested mapping. Account for the pages it would unmap.
1619                 */
1620                nr_pages = count_vma_pages_range(mm, addr, addr + len);
1621
1622                if (!may_expand_vm(mm, vm_flags,
1623                                        (len >> PAGE_SHIFT) - nr_pages))
1624                        return -ENOMEM;
1625        }
1626
1627        /* Clear old maps */
1628        while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1629                              &rb_parent)) {
1630                if (do_munmap(mm, addr, len, uf))
1631                        return -ENOMEM;
1632        }
1633
1634        /*
1635         * Private writable mapping: check memory availability
1636         */
1637        if (accountable_mapping(file, vm_flags)) {
1638                charged = len >> PAGE_SHIFT;
1639                if (security_vm_enough_memory_mm(mm, charged))
1640                        return -ENOMEM;
1641                vm_flags |= VM_ACCOUNT;
1642        }
1643
1644        /*
1645         * Can we just expand an old mapping?
1646         */
1647        vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1648                        NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
1649        if (vma)
1650                goto out;
1651
1652        /*
1653         * Determine the object being mapped and call the appropriate
1654         * specific mapper. The address has already been validated and
1655         * any old mappings in the range removed from the list.
1656         */
1657        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1658        if (!vma) {
1659                error = -ENOMEM;
1660                goto unacct_error;
1661        }
1662
1663        vma->vm_mm = mm;
1664        vma->vm_start = addr;
1665        vma->vm_end = addr + len;
1666        vma->vm_flags = vm_flags;
1667        vma->vm_page_prot = vm_get_page_prot(vm_flags);
1668        vma->vm_pgoff = pgoff;
1669        INIT_LIST_HEAD(&vma->anon_vma_chain);
1670
1671        if (file) {
1672                if (vm_flags & VM_DENYWRITE) {
1673                        error = deny_write_access(file);
1674                        if (error)
1675                                goto free_vma;
1676                }
1677                if (vm_flags & VM_SHARED) {
1678                        error = mapping_map_writable(file->f_mapping);
1679                        if (error)
1680                                goto allow_write_and_free_vma;
1681                }
1682
1683                /* ->mmap() can change vma->vm_file, but must guarantee that
1684                 * vma_link() below can deny write-access if VM_DENYWRITE is set
1685                 * and map writably if VM_SHARED is set. This usually means the
1686                 * new file must not have been exposed to user-space, yet.
1687                 */
1688                vma->vm_file = get_file(file);
1689                error = call_mmap(file, vma);
1690                if (error)
1691                        goto unmap_and_free_vma;
1692
1693                /* Can addr have changed??
1694                 *
1695                 * Answer: Yes, several device drivers can do it in their
1696                 *         f_op->mmap method. -DaveM
1697                 * Bug: If addr is changed, prev, rb_link, rb_parent should
1698                 *      be updated for vma_link()
1699                 */
1700                WARN_ON_ONCE(addr != vma->vm_start);
1701
1702                addr = vma->vm_start;
1703                vm_flags = vma->vm_flags;
1704        } else if (vm_flags & VM_SHARED) {
1705                error = shmem_zero_setup(vma);
1706                if (error)
1707                        goto free_vma;
1708        }
1709
1710        vma_link(mm, vma, prev, rb_link, rb_parent);
1711        /* Once vma denies write, undo our temporary denial count */
1712        if (file) {
1713                if (vm_flags & VM_SHARED)
1714                        mapping_unmap_writable(file->f_mapping);
1715                if (vm_flags & VM_DENYWRITE)
1716                        allow_write_access(file);
1717        }
1718        file = vma->vm_file;
1719out:
1720        perf_event_mmap(vma);
1721
1722        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
1723        if (vm_flags & VM_LOCKED) {
1724                if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1725                                        vma == get_gate_vma(current->mm)))
1726                        mm->locked_vm += (len >> PAGE_SHIFT);
1727                else
1728                        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1729        }
1730
1731        if (file)
1732                uprobe_mmap(vma);
1733
1734        /*
1735         * A new (or expanded) vma always gets soft-dirty status.
1736         * Otherwise the user-space soft-dirty page tracker won't
1737         * be able to distinguish the case where a vma area is unmapped
1738         * and then a new one is mapped in place (which must be treated
1739         * as a completely new data area).
1740         */
1741        vma->vm_flags |= VM_SOFTDIRTY;
1742
1743        vma_set_page_prot(vma);
1744
1745        return addr;
1746
1747unmap_and_free_vma:
1748        vma->vm_file = NULL;
1749        fput(file);
1750
1751        /* Undo any partial mapping done by a device driver. */
1752        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1753        charged = 0;
1754        if (vm_flags & VM_SHARED)
1755                mapping_unmap_writable(file->f_mapping);
1756allow_write_and_free_vma:
1757        if (vm_flags & VM_DENYWRITE)
1758                allow_write_access(file);
1759free_vma:
1760        kmem_cache_free(vm_area_cachep, vma);
1761unacct_error:
1762        if (charged)
1763                vm_unacct_memory(charged);
1764        return error;
1765}
1766
1767unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1768{
1769        /*
1770         * We implement the search by looking for an rbtree node that
1771         * immediately follows a suitable gap. That is,
1772         * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1773         * - gap_end   = vma->vm_start        >= info->low_limit  + length;
1774         * - gap_end - gap_start >= length
1775         */
1776
1777        struct mm_struct *mm = current->mm;
1778        struct vm_area_struct *vma;
1779        unsigned long length, low_limit, high_limit, gap_start, gap_end;
1780
1781        /* Adjust search length to account for worst case alignment overhead */
1782        length = info->length + info->align_mask;
1783        if (length < info->length)
1784                return -ENOMEM;
1785
1786        /* Adjust search limits by the desired length */
1787        if (info->high_limit < length)
1788                return -ENOMEM;
1789        high_limit = info->high_limit - length;
1790
1791        if (info->low_limit > high_limit)
1792                return -ENOMEM;
1793        low_limit = info->low_limit + length;
1794
1795        /* Check if rbtree root looks promising */
1796        if (RB_EMPTY_ROOT(&mm->mm_rb))
1797                goto check_highest;
1798        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1799        if (vma->rb_subtree_gap < length)
1800                goto check_highest;
1801
1802        while (true) {
1803                /* Visit left subtree if it looks promising */
1804                gap_end = vm_start_gap(vma);
1805                if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1806                        struct vm_area_struct *left =
1807                                rb_entry(vma->vm_rb.rb_left,
1808                                         struct vm_area_struct, vm_rb);
1809                        if (left->rb_subtree_gap >= length) {
1810                                vma = left;
1811                                continue;
1812                        }
1813                }
1814
1815                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
1816check_current:
1817                /* Check if current node has a suitable gap */
1818                if (gap_start > high_limit)
1819                        return -ENOMEM;
1820                if (gap_end >= low_limit &&
1821                    gap_end > gap_start && gap_end - gap_start >= length)
1822                        goto found;
1823
1824                /* Visit right subtree if it looks promising */
1825                if (vma->vm_rb.rb_right) {
1826                        struct vm_area_struct *right =
1827                                rb_entry(vma->vm_rb.rb_right,
1828                                         struct vm_area_struct, vm_rb);
1829                        if (right->rb_subtree_gap >= length) {
1830                                vma = right;
1831                                continue;
1832                        }
1833                }
1834
1835                /* Go back up the rbtree to find next candidate node */
1836                while (true) {
1837                        struct rb_node *prev = &vma->vm_rb;
1838                        if (!rb_parent(prev))
1839                                goto check_highest;
1840                        vma = rb_entry(rb_parent(prev),
1841                                       struct vm_area_struct, vm_rb);
1842                        if (prev == vma->vm_rb.rb_left) {
1843                                gap_start = vm_end_gap(vma->vm_prev);
1844                                gap_end = vm_start_gap(vma);
1845                                goto check_current;
1846                        }
1847                }
1848        }
1849
1850check_highest:
1851        /* Check highest gap, which does not precede any rbtree node */
1852        gap_start = mm->highest_vm_end;
1853        gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
1854        if (gap_start > high_limit)
1855                return -ENOMEM;
1856
1857found:
1858        /* We found a suitable gap. Clip it with the original low_limit. */
1859        if (gap_start < info->low_limit)
1860                gap_start = info->low_limit;
1861
1862        /* Adjust gap address to the desired alignment */
1863        gap_start += (info->align_offset - gap_start) & info->align_mask;
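        /*
         * E.g. for a 2 MiB alignment request (align_mask = 0x1fffff,
         * align_offset = 0), a gap_start of 0x12345000 is rounded up to
         * 0x12400000 by the line above.
         */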
1864
1865        VM_BUG_ON(gap_start + info->length > info->high_limit);
1866        VM_BUG_ON(gap_start + info->length > gap_end);
1867        return gap_start;
1868}
1869
1870unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1871{
1872        struct mm_struct *mm = current->mm;
1873        struct vm_area_struct *vma;
1874        unsigned long length, low_limit, high_limit, gap_start, gap_end;
1875
1876        /* Adjust search length to account for worst case alignment overhead */
1877        length = info->length + info->align_mask;
1878        if (length < info->length)
1879                return -ENOMEM;
1880
1881        /*
1882         * Adjust search limits by the desired length.
1883         * See implementation comment at top of unmapped_area().
1884         */
1885        gap_end = info->high_limit;
1886        if (gap_end < length)
1887                return -ENOMEM;
1888        high_limit = gap_end - length;
1889
1890        if (info->low_limit > high_limit)
1891                return -ENOMEM;
1892        low_limit = info->low_limit + length;
1893
1894        /* Check highest gap, which does not precede any rbtree node */
1895        gap_start = mm->highest_vm_end;
1896        if (gap_start <= high_limit)
1897                goto found_highest;
1898
1899        /* Check if rbtree root looks promising */
1900        if (RB_EMPTY_ROOT(&mm->mm_rb))
1901                return -ENOMEM;
1902        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1903        if (vma->rb_subtree_gap < length)
1904                return -ENOMEM;
1905
1906        while (true) {
1907                /* Visit right subtree if it looks promising */
1908                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
1909                if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1910                        struct vm_area_struct *right =
1911                                rb_entry(vma->vm_rb.rb_right,
1912                                         struct vm_area_struct, vm_rb);
1913                        if (right->rb_subtree_gap >= length) {
1914                                vma = right;
1915                                continue;
1916                        }
1917                }
1918
1919check_current:
1920                /* Check if current node has a suitable gap */
1921                gap_end = vm_start_gap(vma);
1922                if (gap_end < low_limit)
1923                        return -ENOMEM;
1924                if (gap_start <= high_limit &&
1925                    gap_end > gap_start && gap_end - gap_start >= length)
1926                        goto found;
1927
1928                /* Visit left subtree if it looks promising */
1929                if (vma->vm_rb.rb_left) {
1930                        struct vm_area_struct *left =
1931                                rb_entry(vma->vm_rb.rb_left,
1932                                         struct vm_area_struct, vm_rb);
1933                        if (left->rb_subtree_gap >= length) {
1934                                vma = left;
1935                                continue;
1936                        }
1937                }
1938
1939                /* Go back up the rbtree to find next candidate node */
1940                while (true) {
1941                        struct rb_node *prev = &vma->vm_rb;
1942                        if (!rb_parent(prev))
1943                                return -ENOMEM;
1944                        vma = rb_entry(rb_parent(prev),
1945                                       struct vm_area_struct, vm_rb);
1946                        if (prev == vma->vm_rb.rb_right) {
1947                                gap_start = vma->vm_prev ?
1948                                        vm_end_gap(vma->vm_prev) : 0;
1949                                goto check_current;
1950                        }
1951                }
1952        }
1953
1954found:
1955        /* We found a suitable gap. Clip it with the original high_limit. */
1956        if (gap_end > info->high_limit)
1957                gap_end = info->high_limit;
1958
1959found_highest:
1960        /* Compute highest gap address at the desired alignment */
1961        gap_end -= info->length;
1962        gap_end -= (gap_end - info->align_offset) & info->align_mask;
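        /*
         * Counterpart of the round-up in unmapped_area(): this rounds the
         * candidate address down to the requested alignment instead.
         */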
1963
1964        VM_BUG_ON(gap_end < info->low_limit);
1965        VM_BUG_ON(gap_end < gap_start);
1966        return gap_end;
1967}
1968
1969/* Get an address range which is currently unmapped.
1970 * For shmat() with addr=0.
1971 *
1972 * Ugly calling convention alert:
1973 * Return value with the low bits set means error value,
1974 * ie
1975 *      if (ret & ~PAGE_MASK)
1976 *              error = ret;
1977 *
1978 * This function "knows" that -ENOMEM has the bits set.
1979 */
1980#ifndef HAVE_ARCH_UNMAPPED_AREA
1981unsigned long
1982arch_get_unmapped_area(struct file *filp, unsigned long addr,
1983                unsigned long len, unsigned long pgoff, unsigned long flags)
1984{
1985        struct mm_struct *mm = current->mm;
1986        struct vm_area_struct *vma, *prev;
1987        struct vm_unmapped_area_info info;
1988
1989        if (len > TASK_SIZE - mmap_min_addr)
1990                return -ENOMEM;
1991
1992        if (flags & MAP_FIXED)
1993                return addr;
1994
1995        if (addr) {
1996                addr = PAGE_ALIGN(addr);
1997                vma = find_vma_prev(mm, addr, &prev);
1998                if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1999                    (!vma || addr + len <= vm_start_gap(vma)) &&
2000                    (!prev || addr >= vm_end_gap(prev)))
2001                        return addr;
2002        }
2003
2004        info.flags = 0;
2005        info.length = len;
2006        info.low_limit = mm->mmap_base;
2007        info.high_limit = TASK_SIZE;
2008        info.align_mask = 0;
2009        return vm_unmapped_area(&info);
2010}
2011#endif
2012
2013/*
2014 * This mmap-allocator allocates new areas top-down from below the
2015 * stack's low limit (the base):
2016 */
2017#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
2018unsigned long
2019arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
2020                          const unsigned long len, const unsigned long pgoff,
2021                          const unsigned long flags)
2022{
2023        struct vm_area_struct *vma, *prev;
2024        struct mm_struct *mm = current->mm;
2025        unsigned long addr = addr0;
2026        struct vm_unmapped_area_info info;
2027
2028        /* requested length too big for entire address space */
2029        if (len > TASK_SIZE - mmap_min_addr)
2030                return -ENOMEM;
2031
2032        if (flags & MAP_FIXED)
2033                return addr;
2034
2035        /* requesting a specific address */
2036        if (addr) {
2037                addr = PAGE_ALIGN(addr);
2038                vma = find_vma_prev(mm, addr, &prev);
2039                if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
2040                                (!vma || addr + len <= vm_start_gap(vma)) &&
2041                                (!prev || addr >= vm_end_gap(prev)))
2042                        return addr;
2043        }
2044
2045        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
2046        info.length = len;
2047        info.low_limit = max(PAGE_SIZE, mmap_min_addr);
2048        info.high_limit = mm->mmap_base;
2049        info.align_mask = 0;
2050        addr = vm_unmapped_area(&info);
2051
2052        /*
2053         * A failed mmap() very likely causes application failure,
2054         * so fall back to the bottom-up function here. This scenario
2055         * can happen with large stack limits and large mmap()
2056         * allocations.
2057         */
2058        if (offset_in_page(addr)) {
2059                VM_BUG_ON(addr != -ENOMEM);
2060                info.flags = 0;
2061                info.low_limit = TASK_UNMAPPED_BASE;
2062                info.high_limit = TASK_SIZE;
2063                addr = vm_unmapped_area(&info);
2064        }
2065
2066        return addr;
2067}
2068#endif
2069
2070unsigned long
2071get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2072                unsigned long pgoff, unsigned long flags)
2073{
2074        unsigned long (*get_area)(struct file *, unsigned long,
2075                                  unsigned long, unsigned long, unsigned long);
2076
2077        unsigned long error = arch_mmap_check(addr, len, flags);
2078        if (error)
2079                return error;
2080
2081        /* Careful about overflows.. */
2082        if (len > TASK_SIZE)
2083                return -ENOMEM;
2084
2085        get_area = current->mm->get_unmapped_area;
2086        if (file) {
2087                if (file->f_op->get_unmapped_area)
2088                        get_area = file->f_op->get_unmapped_area;
2089        } else if (flags & MAP_SHARED) {
2090                /*
2091                 * mmap_region() will call shmem_zero_setup() to create a file,
2092                 * so use shmem's get_unmapped_area in case it can be huge.
2093                 * do_mmap_pgoff() will clear pgoff, so match alignment.
2094                 */
2095                pgoff = 0;
2096                get_area = shmem_get_unmapped_area;
2097        }
2098
2099        addr = get_area(file, addr, len, pgoff, flags);
2100        if (IS_ERR_VALUE(addr))
2101                return addr;
2102
2103        if (addr > TASK_SIZE - len)
2104                return -ENOMEM;
2105        if (offset_in_page(addr))
2106                return -EINVAL;
2107
2108        error = security_mmap_addr(addr);
2109        return error ? error : addr;
2110}
2111
2112EXPORT_SYMBOL(get_unmapped_area);
2113
2114/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
2115struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2116{
2117        struct rb_node *rb_node;
2118        struct vm_area_struct *vma;
2119
2120        /* Check the cache first. */
2121        vma = vmacache_find(mm, addr);
2122        if (likely(vma))
2123                return vma;
2124
2125        rb_node = mm->mm_rb.rb_node;
2126
2127        while (rb_node) {
2128                struct vm_area_struct *tmp;
2129
2130                tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2131
2132                if (tmp->vm_end > addr) {
2133                        vma = tmp;
2134                        if (tmp->vm_start <= addr)
2135                                break;
2136                        rb_node = rb_node->rb_left;
2137                } else
2138                        rb_node = rb_node->rb_right;
2139        }
2140
2141        if (vma)
2142                vmacache_update(addr, vma);
2143        return vma;
2144}
2145
2146EXPORT_SYMBOL(find_vma);
2147
2148/*
2149 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2150 */
2151struct vm_area_struct *
2152find_vma_prev(struct mm_struct *mm, unsigned long addr,
2153                        struct vm_area_struct **pprev)
2154{
2155        struct vm_area_struct *vma;
2156
2157        vma = find_vma(mm, addr);
2158        if (vma) {
2159                *pprev = vma->vm_prev;
2160        } else {
2161                struct rb_node *rb_node = mm->mm_rb.rb_node;
2162                *pprev = NULL;
2163                while (rb_node) {
2164                        *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2165                        rb_node = rb_node->rb_right;
2166                }
2167        }
2168        return vma;
2169}
2170
2171/*
2172 * Verify that the stack growth is acceptable and
2173 * update accounting. This is shared with both the
2174 * grow-up and grow-down cases.
2175 */
2176static int acct_stack_growth(struct vm_area_struct *vma,
2177                             unsigned long size, unsigned long grow)
2178{
2179        struct mm_struct *mm = vma->vm_mm;
2180        unsigned long new_start;
2181
2182        /* address space limit tests */
2183        if (!may_expand_vm(mm, vma->vm_flags, grow))
2184                return -ENOMEM;
2185
2186        /* Stack limit test */
2187        if (size > rlimit(RLIMIT_STACK))
2188                return -ENOMEM;
2189
2190        /* mlock limit tests */
2191        if (vma->vm_flags & VM_LOCKED) {
2192                unsigned long locked;
2193                unsigned long limit;
2194                locked = mm->locked_vm + grow;
2195                limit = rlimit(RLIMIT_MEMLOCK);
2196                limit >>= PAGE_SHIFT;
2197                if (locked > limit && !capable(CAP_IPC_LOCK))
2198                        return -ENOMEM;
2199        }
2200
2201        /* Check to ensure the stack will not grow into a hugetlb-only region */
2202        new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2203                        vma->vm_end - size;
2204        if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2205                return -EFAULT;
2206
2207        /*
2208         * Overcommit..  This must be the final test, as it will
2209         * update security statistics.
2210         */
2211        if (security_vm_enough_memory_mm(mm, grow))
2212                return -ENOMEM;
2213
2214        return 0;
2215}
2216
2217#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2218/*
2219 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2220 * vma is the last one with address > vma->vm_end.  Have to extend vma.
2221 */
2222int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2223{
2224        struct mm_struct *mm = vma->vm_mm;
2225        struct vm_area_struct *next;
2226        unsigned long gap_addr;
2227        int error = 0;
2228
2229        if (!(vma->vm_flags & VM_GROWSUP))
2230                return -EFAULT;
2231
2232        /* Guard against exceeding limits of the address space. */
2233        address &= PAGE_MASK;
2234        if (address >= (TASK_SIZE & PAGE_MASK))
2235                return -ENOMEM;
2236        address += PAGE_SIZE;
2237
2238        /* Enforce stack_guard_gap */
2239        gap_addr = address + stack_guard_gap;
2240
2241        /* Guard against overflow */
2242        if (gap_addr < address || gap_addr > TASK_SIZE)
2243                gap_addr = TASK_SIZE;
2244
2245        next = vma->vm_next;
2246        if (next && next->vm_start < gap_addr &&
2247                        (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
2248                if (!(next->vm_flags & VM_GROWSUP))
2249                        return -ENOMEM;
2250                /* Check that both stack segments have the same anon_vma? */
2251        }
2252
2253        /* We must make sure the anon_vma is allocated. */
2254        if (unlikely(anon_vma_prepare(vma)))
2255                return -ENOMEM;
2256
2257        /*
2258         * vma->vm_start/vm_end cannot change under us because the caller
2259         * is required to hold the mmap_sem in read mode.  We need the
2260         * anon_vma lock to serialize against concurrent expand_stacks.
2261         */
2262        anon_vma_lock_write(vma->anon_vma);
2263
2264        /* Somebody else might have raced and expanded it already */
2265        if (address > vma->vm_end) {
2266                unsigned long size, grow;
2267
2268                size = address - vma->vm_start;
2269                grow = (address - vma->vm_end) >> PAGE_SHIFT;
2270
2271                error = -ENOMEM;
2272                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2273                        error = acct_stack_growth(vma, size, grow);
2274                        if (!error) {
2275                                /*
2276                                 * vma_gap_update() doesn't support concurrent
2277                                 * updates, but we only hold a shared mmap_sem
2278                                 * lock here, so we need to protect against
2279                                 * concurrent vma expansions.
2280                                 * anon_vma_lock_write() doesn't help here, as
2281                                 * we don't guarantee that all growable vmas
2282                                 * in a mm share the same root anon vma.
2283                                 * So, we reuse mm->page_table_lock to guard
2284                                 * against concurrent vma expansions.
2285                                 */
2286                                spin_lock(&mm->page_table_lock);
2287                                if (vma->vm_flags & VM_LOCKED)
2288                                        mm->locked_vm += grow;
2289                                vm_stat_account(mm, vma->vm_flags, grow);
2290                                anon_vma_interval_tree_pre_update_vma(vma);
2291                                vma->vm_end = address;
2292                                anon_vma_interval_tree_post_update_vma(vma);
2293                                if (vma->vm_next)
2294                                        vma_gap_update(vma->vm_next);
2295                                else
2296                                        mm->highest_vm_end = vm_end_gap(vma);
2297                                spin_unlock(&mm->page_table_lock);
2298
2299                                perf_event_mmap(vma);
2300                        }
2301                }
2302        }
2303        anon_vma_unlock_write(vma->anon_vma);
2304        khugepaged_enter_vma_merge(vma, vma->vm_flags);
2305        validate_mm(mm);
2306        return error;
2307}
2308#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2309
2310/*
2311 * vma is the first one with address < vma->vm_start.  Have to extend vma.
2312 */
2313int expand_downwards(struct vm_area_struct *vma,
2314                                   unsigned long address)
2315{
2316        struct mm_struct *mm = vma->vm_mm;
2317        struct vm_area_struct *prev;
2318        int error;
2319
2320        address &= PAGE_MASK;
2321        error = security_mmap_addr(address);
2322        if (error)
2323                return error;
2324
2325        /* Enforce stack_guard_gap */
2326        prev = vma->vm_prev;
2327        /* Check that both stack segments have the same anon_vma? */
2328        if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
2329                        (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
2330                if (address - prev->vm_end < stack_guard_gap)
2331                        return -ENOMEM;
2332        }
2333
2334        /* We must make sure the anon_vma is allocated. */
2335        if (unlikely(anon_vma_prepare(vma)))
2336                return -ENOMEM;
2337
2338        /*
2339         * vma->vm_start/vm_end cannot change under us because the caller
2340         * is required to hold the mmap_sem in read mode.  We need the
2341         * anon_vma lock to serialize against concurrent expand_stacks.
2342         */
2343        anon_vma_lock_write(vma->anon_vma);
2344
2345        /* Somebody else might have raced and expanded it already */
2346        if (address < vma->vm_start) {
2347                unsigned long size, grow;
2348
2349                size = vma->vm_end - address;
2350                grow = (vma->vm_start - address) >> PAGE_SHIFT;
2351
2352                error = -ENOMEM;
2353                if (grow <= vma->vm_pgoff) {
2354                        error = acct_stack_growth(vma, size, grow);
2355                        if (!error) {
2356                                /*
2357                                 * vma_gap_update() doesn't support concurrent
2358                                 * updates, but we only hold a shared mmap_sem
2359                                 * lock here, so we need to protect against
2360                                 * concurrent vma expansions.
2361                                 * anon_vma_lock_write() doesn't help here, as
2362                                 * we don't guarantee that all growable vmas
2363                                 * in a mm share the same root anon vma.
2364                                 * So, we reuse mm->page_table_lock to guard
2365                                 * against concurrent vma expansions.
2366                                 */
2367                                spin_lock(&mm->page_table_lock);
2368                                if (vma->vm_flags & VM_LOCKED)
2369                                        mm->locked_vm += grow;
2370                                vm_stat_account(mm, vma->vm_flags, grow);
2371                                anon_vma_interval_tree_pre_update_vma(vma);
2372                                vma->vm_start = address;
2373                                vma->vm_pgoff -= grow;
2374                                anon_vma_interval_tree_post_update_vma(vma);
2375                                vma_gap_update(vma);
2376                                spin_unlock(&mm->page_table_lock);
2377
2378                                perf_event_mmap(vma);
2379                        }
2380                }
2381        }
2382        anon_vma_unlock_write(vma->anon_vma);
2383        khugepaged_enter_vma_merge(vma, vma->vm_flags);
2384        validate_mm(mm);
2385        return error;
2386}
2387
2388/* enforced gap between the expanding stack and other mappings. */
2389unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
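/*
 * With 4 KiB pages the default above is 1 MiB; stack_guard_gap= on the
 * kernel command line takes the gap size in pages (parsed below).
 */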
2390
2391static int __init cmdline_parse_stack_guard_gap(char *p)
2392{
2393        unsigned long val;
2394        char *endptr;
2395
2396        val = simple_strtoul(p, &endptr, 10);
2397        if (!*endptr)
2398                stack_guard_gap = val << PAGE_SHIFT;
2399
2400        return 0;
2401}
2402__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
2403
2404#ifdef CONFIG_STACK_GROWSUP
2405int expand_stack(struct vm_area_struct *vma, unsigned long address)
2406{
2407        return expand_upwards(vma, address);
2408}
2409
2410struct vm_area_struct *
2411find_extend_vma(struct mm_struct *mm, unsigned long addr)
2412{
2413        struct vm_area_struct *vma, *prev;
2414
2415        addr &= PAGE_MASK;
2416        vma = find_vma_prev(mm, addr, &prev);
2417        if (vma && (vma->vm_start <= addr))
2418                return vma;
2419        if (!prev || expand_stack(prev, addr))
2420                return NULL;
2421        if (prev->vm_flags & VM_LOCKED)
2422                populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2423        return prev;
2424}
2425#else
2426int expand_stack(struct vm_area_struct *vma, unsigned long address)
2427{
2428        return expand_downwards(vma, address);
2429}
2430
2431struct vm_area_struct *
2432find_extend_vma(struct mm_struct *mm, unsigned long addr)
2433{
2434        struct vm_area_struct *vma;
2435        unsigned long start;
2436
2437        addr &= PAGE_MASK;
2438        vma = find_vma(mm, addr);
2439        if (!vma)
2440                return NULL;
2441        if (vma->vm_start <= addr)
2442                return vma;
2443        if (!(vma->vm_flags & VM_GROWSDOWN))
2444                return NULL;
2445        start = vma->vm_start;
2446        if (expand_stack(vma, addr))
2447                return NULL;
2448        if (vma->vm_flags & VM_LOCKED)
2449                populate_vma_page_range(vma, addr, start, NULL);
2450        return vma;
2451}
2452#endif
2453
2454EXPORT_SYMBOL_GPL(find_extend_vma);
2455
2456/*
2457 * Ok - we have the memory areas we should free on the vma list,
2458 * so release them, and do the vma updates.
2459 *
2460 * Called with the mm semaphore held.
2461 */
2462static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2463{
2464        unsigned long nr_accounted = 0;
2465
2466        /* Update high watermark before we lower total_vm */
2467        update_hiwater_vm(mm);
2468        do {
2469                long nrpages = vma_pages(vma);
2470
2471                if (vma->vm_flags & VM_ACCOUNT)
2472                        nr_accounted += nrpages;
2473                vm_stat_account(mm, vma->vm_flags, -nrpages);
2474                vma = remove_vma(vma);
2475        } while (vma);
2476        vm_unacct_memory(nr_accounted);
2477        validate_mm(mm);
2478}
2479
2480/*
2481 * Get rid of page table information in the indicated region.
2482 *
2483 * Called with the mm semaphore held.
2484 */
2485static void unmap_region(struct mm_struct *mm,
2486                struct vm_area_struct *vma, struct vm_area_struct *prev,
2487                unsigned long start, unsigned long end)
2488{
2489        struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2490        struct mmu_gather tlb;
2491
2492        lru_add_drain();
2493        tlb_gather_mmu(&tlb, mm, start, end);
2494        update_hiwater_rss(mm);
2495        unmap_vmas(&tlb, vma, start, end);
2496        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2497                                 next ? next->vm_start : USER_PGTABLES_CEILING);
2498        tlb_finish_mmu(&tlb, start, end);
2499}
2500
2501/*
2502 * Create a list of vmas touched by the unmap, removing them from the mm's
2503 * vma list as we go.
2504 */
2505static void
2506detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2507        struct vm_area_struct *prev, unsigned long end)
2508{
2509        struct vm_area_struct **insertion_point;
2510        struct vm_area_struct *tail_vma = NULL;
2511
2512        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2513        vma->vm_prev = NULL;
2514        do {
2515                vma_rb_erase(vma, &mm->mm_rb);
2516                mm->map_count--;
2517                tail_vma = vma;
2518                vma = vma->vm_next;
2519        } while (vma && vma->vm_start < end);
2520        *insertion_point = vma;
2521        if (vma) {
2522                vma->vm_prev = prev;
2523                vma_gap_update(vma);
2524        } else
2525                mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
2526        tail_vma->vm_next = NULL;
2527
2528        /* Kill the cache */
2529        vmacache_invalidate(mm);
2530}
2531
2532/*
2533 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
2534 * has already been checked or doesn't make sense to fail.
2535 */
2536int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2537                unsigned long addr, int new_below)
2538{
2539        struct vm_area_struct *new;
2540        int err;
2541
2542        if (is_vm_hugetlb_page(vma) && (addr &
2543                                        ~(huge_page_mask(hstate_vma(vma)))))
2544                return -EINVAL;
2545
2546        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2547        if (!new)
2548                return -ENOMEM;
2549
2550        /* most fields are the same, copy all, and then fixup */
2551        *new = *vma;
2552
2553        INIT_LIST_HEAD(&new->anon_vma_chain);
2554
2555        if (new_below)
2556                new->vm_end = addr;
2557        else {
2558                new->vm_start = addr;
2559                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2560        }
2561
2562        err = vma_dup_policy(vma, new);
2563        if (err)
2564                goto out_free_vma;
2565
2566        err = anon_vma_clone(new, vma);
2567        if (err)
2568                goto out_free_mpol;
2569
2570        if (new->vm_file)
2571                get_file(new->vm_file);
2572
2573        if (new->vm_ops && new->vm_ops->open)
2574                new->vm_ops->open(new);
2575
2576        if (new_below)
2577                err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2578                        ((addr - new->vm_start) >> PAGE_SHIFT), new);
2579        else
2580                err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2581
2582        /* Success. */
2583        if (!err)
2584                return 0;
2585
2586        /* Clean everything up if vma_adjust failed. */
2587        if (new->vm_ops && new->vm_ops->close)
2588                new->vm_ops->close(new);
2589        if (new->vm_file)
2590                fput(new->vm_file);
2591        unlink_anon_vmas(new);
2592 out_free_mpol:
2593        mpol_put(vma_policy(new));
2594 out_free_vma:
2595        kmem_cache_free(vm_area_cachep, new);
2596        return err;
2597}
2598
2599/*
2600 * Split a vma into two pieces at address 'addr', a new vma is allocated
2601 * either for the first part or the tail.
2602 */
2603int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2604              unsigned long addr, int new_below)
2605{
2606        if (mm->map_count >= sysctl_max_map_count)
2607                return -ENOMEM;
2608
2609        return __split_vma(mm, vma, addr, new_below);
2610}
2611
2612/* Munmap is split into 2 main parts -- this part which finds
2613 * what needs doing, and the areas themselves, which do the
2614 * work.  This now handles partial unmappings.
2615 * Jeremy Fitzhardinge <jeremy@goop.org>
2616 */
2617int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2618              struct list_head *uf)
2619{
2620        unsigned long end;
2621        struct vm_area_struct *vma, *prev, *last;
2622
2623        if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
2624                return -EINVAL;
2625
2626        len = PAGE_ALIGN(len);
2627        if (len == 0)
2628                return -EINVAL;
2629
2630        /* Find the first overlapping VMA */
2631        vma = find_vma(mm, start);
2632        if (!vma)
2633                return 0;
2634        prev = vma->vm_prev;
2635        /* we have  start < vma->vm_end  */
2636
2637        /* if it doesn't overlap, we have nothing.. */
2638        end = start + len;
2639        if (vma->vm_start >= end)
2640                return 0;
2641
2642        if (uf) {
2643                int error = userfaultfd_unmap_prep(vma, start, end, uf);
2644
2645                if (error)
2646                        return error;
2647        }
2648
2649        /*
2650         * If we need to split any vma, do it now to save pain later.
2651         *
2652         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2653         * unmapped vm_area_struct will remain in use: so lower split_vma
2654         * places tmp vma above, and higher split_vma places tmp vma below.
2655         */
2656        if (start > vma->vm_start) {
2657                int error;
2658
2659                /*
2660                 * Make sure that map_count on return from munmap() will
2661                 * not exceed its limit; but let map_count go just above
2662                 * its limit temporarily, to help free resources as expected.
2663                 */
2664                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2665                        return -ENOMEM;
2666
2667                error = __split_vma(mm, vma, start, 0);
2668                if (error)
2669                        return error;
2670                prev = vma;
2671        }
2672
2673        /* Does it split the last one? */
2674        last = find_vma(mm, end);
2675        if (last && end > last->vm_start) {
2676                int error = __split_vma(mm, last, end, 1);
2677                if (error)
2678                        return error;
2679        }
2680        vma = prev ? prev->vm_next : mm->mmap;
2681
2682        /*
2683         * unlock any mlock()ed ranges before detaching vmas
2684         */
2685        if (mm->locked_vm) {
2686                struct vm_area_struct *tmp = vma;
2687                while (tmp && tmp->vm_start < end) {
2688                        if (tmp->vm_flags & VM_LOCKED) {
2689                                mm->locked_vm -= vma_pages(tmp);
2690                                munlock_vma_pages_all(tmp);
2691                        }
2692                        tmp = tmp->vm_next;
2693                }
2694        }
2695
2696        /*
2697         * Remove the vma's, and unmap the actual pages
2698         */
2699        detach_vmas_to_be_unmapped(mm, vma, prev, end);
2700        unmap_region(mm, vma, prev, start, end);
2701
2702        arch_unmap(mm, vma, start, end);
2703
2704        /* Fix up all other VM information */
2705        remove_vma_list(mm, vma);
2706
2707        return 0;
2708}
2709
2710int vm_munmap(unsigned long start, size_t len)
2711{
2712        int ret;
2713        struct mm_struct *mm = current->mm;
2714        LIST_HEAD(uf);
2715
2716        if (down_write_killable(&mm->mmap_sem))
2717                return -EINTR;
2718
2719        ret = do_munmap(mm, start, len, &uf);
2720        up_write(&mm->mmap_sem);
2721        userfaultfd_unmap_complete(mm, &uf);
2722        return ret;
2723}
2724EXPORT_SYMBOL(vm_munmap);
2725
2726SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2727{
2728        profile_munmap(addr);
2729        return vm_munmap(addr, len);
2730}
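
/*
 * Illustrative user-space sketch (kept under #if 0, not part of the kernel
 * build): partially unmapping the middle page of a three-page anonymous
 * mapping goes through do_munmap()/__split_vma() above and leaves two
 * separate VMAs behind, visible as two ranges in /proc/self/maps.
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Punch a hole in the middle page: the kernel splits the vma. */
	if (munmap(p + pg, pg) != 0) {
		perror("munmap");
		return 1;
	}
	printf("kept [%p, %p) and [%p, %p)\n",
	       (void *)p, (void *)(p + pg),
	       (void *)(p + 2 * pg), (void *)(p + 3 * pg));
	return 0;
}
#endif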
2731
2732
2733/*
2734 * Emulation of deprecated remap_file_pages() syscall.
2735 */
2736SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2737                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
2738{
2739
2740        struct mm_struct *mm = current->mm;
2741        struct vm_area_struct *vma;
2742        unsigned long populate = 0;
2743        unsigned long ret = -EINVAL;
2744        struct file *file;
2745
2746        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
2747                     current->comm, current->pid);
2748
2749        if (prot)
2750                return ret;
2751        start = start & PAGE_MASK;
2752        size = size & PAGE_MASK;
2753
2754        if (start + size <= start)
2755                return ret;
2756
2757        /* Does pgoff wrap? */
2758        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2759                return ret;
2760
2761        if (down_write_killable(&mm->mmap_sem))
2762                return -EINTR;
2763
2764        vma = find_vma(mm, start);
2765
2766        if (!vma || !(vma->vm_flags & VM_SHARED))
2767                goto out;
2768
2769        if (start < vma->vm_start)
2770                goto out;
2771
2772        if (start + size > vma->vm_end) {
2773                struct vm_area_struct *next;
2774
2775                for (next = vma->vm_next; next; next = next->vm_next) {
2776                        /* hole between vmas ? */
2777                        if (next->vm_start != next->vm_prev->vm_end)
2778                                goto out;
2779
2780                        if (next->vm_file != vma->vm_file)
2781                                goto out;
2782
2783                        if (next->vm_flags != vma->vm_flags)
2784                                goto out;
2785
2786                        if (start + size <= next->vm_end)
2787                                break;
2788                }
2789
2790                if (!next)
2791                        goto out;
2792        }
2793
2794        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
2795        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
2796        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
2797
2798        flags &= MAP_NONBLOCK;
2799        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
2800        if (vma->vm_flags & VM_LOCKED) {
2801                struct vm_area_struct *tmp;
2802                flags |= MAP_LOCKED;
2803
2804                /* drop PG_Mlocked flag for over-mapped range */
2805                for (tmp = vma; tmp && tmp->vm_start < start + size;
2806                                tmp = tmp->vm_next) {
2807                        /*
2808                         * Split pmd and munlock page on the border
2809                         * of the range.
2810                         */
2811                        vma_adjust_trans_huge(tmp, start, start + size, 0);
2812
2813                        munlock_vma_pages_range(tmp,
2814                                        max(tmp->vm_start, start),
2815                                        min(tmp->vm_end, start + size));
2816                }
2817        }
2818
2819        file = get_file(vma->vm_file);
2820        ret = do_mmap_pgoff(vma->vm_file, start, size,
2821                        prot, flags, pgoff, &populate, NULL);
2822        fput(file);
2823out:
2824        up_write(&mm->mmap_sem);
2825        if (populate)
2826                mm_populate(ret, populate);
2827        if (!IS_ERR_VALUE(ret))
2828                ret = 0;
2829        return ret;
2830}
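
/*
 * Illustrative sketch (userspace, not kernel code): the emulation above
 * rewrites the request into an ordinary mmap() of the same file with
 * MAP_SHARED | MAP_FIXED | MAP_POPULATE and the protection bits taken from
 * the existing vma.  Assuming "fd" is already mapped MAP_SHARED at "base":
 *
 *	remap_file_pages(base, page_size, 0, 2, 0);
 *
 * is handled roughly like:
 *
 *	mmap(base, page_size, PROT_READ | PROT_WRITE,
 *	     MAP_SHARED | MAP_FIXED | MAP_POPULATE, fd, 2 * page_size);
 *
 * where PROT_READ | PROT_WRITE stands in for whatever protections the
 * original vma carries; note that pgoff is in pages for remap_file_pages()
 * but mmap() takes a byte offset.
 */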
2831
2832static inline void verify_mm_writelocked(struct mm_struct *mm)
2833{
2834#ifdef CONFIG_DEBUG_VM
2835        if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2836                WARN_ON(1);
2837                up_read(&mm->mmap_sem);
2838        }
2839#endif
2840}
2841
2842/*
2843 *  this is really a simplified "do_mmap".  it only handles
2844 *  anonymous maps.  eventually we may be able to do some
2845 *  brk-specific accounting here.
2846 */
2847static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
2848{
2849        struct mm_struct *mm = current->mm;
2850        struct vm_area_struct *vma, *prev;
2851        unsigned long len;
2852        struct rb_node **rb_link, *rb_parent;
2853        pgoff_t pgoff = addr >> PAGE_SHIFT;
2854        int error;
2855
2856        len = PAGE_ALIGN(request);
2857        if (len < request)
2858                return -ENOMEM;
2859        if (!len)
2860                return 0;
2861
2862        /* Until we need other flags, refuse anything except VM_EXEC. */
2863        if ((flags & (~VM_EXEC)) != 0)
2864                return -EINVAL;
2865        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2866
2867        error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2868        if (offset_in_page(error))
2869                return error;
2870
2871        error = mlock_future_check(mm, mm->def_flags, len);
2872        if (error)
2873                return error;
2874
2875        /*
2876         * mm->mmap_sem is required to protect against another thread
2877         * changing the mappings in case we sleep.
2878         */
2879        verify_mm_writelocked(mm);
2880
2881        /*
2882         * Clear old maps.  this also does some error checking for us
2883         */
2884        while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2885                              &rb_parent)) {
2886                if (do_munmap(mm, addr, len, uf))
2887                        return -ENOMEM;
2888        }
2889
2890        /* Check against address space limits *after* clearing old maps... */
2891        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
2892                return -ENOMEM;
2893
2894        if (mm->map_count > sysctl_max_map_count)
2895                return -ENOMEM;
2896
2897        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2898                return -ENOMEM;
2899
2900        /* Can we just expand an old private anonymous mapping? */
2901        vma = vma_merge(mm, prev, addr, addr + len, flags,
2902                        NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
2903        if (vma)
2904                goto out;
2905
2906        /*
2907         * create a vma struct for an anonymous mapping
2908         */
2909        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2910        if (!vma) {
2911                vm_unacct_memory(len >> PAGE_SHIFT);
2912                return -ENOMEM;
2913        }
2914
2915        INIT_LIST_HEAD(&vma->anon_vma_chain);
2916        vma->vm_mm = mm;
2917        vma->vm_start = addr;
2918        vma->vm_end = addr + len;
2919        vma->vm_pgoff = pgoff;
2920        vma->vm_flags = flags;
2921        vma->vm_page_prot = vm_get_page_prot(flags);
2922        vma_link(mm, vma, prev, rb_link, rb_parent);
2923out:
2924        perf_event_mmap(vma);
2925        mm->total_vm += len >> PAGE_SHIFT;
2926        mm->data_vm += len >> PAGE_SHIFT;
2927        if (flags & VM_LOCKED)
2928                mm->locked_vm += (len >> PAGE_SHIFT);
2929        vma->vm_flags |= VM_SOFTDIRTY;
2930        return 0;
2931}
2932
2933static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
2934{
2935        return do_brk_flags(addr, len, 0, uf);
2936}
2937
2938int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
2939{
2940        struct mm_struct *mm = current->mm;
2941        int ret;
2942        bool populate;
2943        LIST_HEAD(uf);
2944
2945        if (down_write_killable(&mm->mmap_sem))
2946                return -EINTR;
2947
2948        ret = do_brk_flags(addr, len, flags, &uf);
2949        populate = ((mm->def_flags & VM_LOCKED) != 0);
2950        up_write(&mm->mmap_sem);
2951        userfaultfd_unmap_complete(mm, &uf);
2952        if (populate && !ret)
2953                mm_populate(addr, len);
2954        return ret;
2955}
2956EXPORT_SYMBOL(vm_brk_flags);
2957
2958int vm_brk(unsigned long addr, unsigned long len)
2959{
2960        return vm_brk_flags(addr, len, 0);
2961}
2962EXPORT_SYMBOL(vm_brk);
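
/*
 * Illustrative sketch (not part of this file): vm_brk()/vm_brk_flags() are
 * used by in-kernel callers such as binary loaders to create anonymous,
 * zero-filled regions without going through sys_brk.  The names bss_start
 * and bss_end below are hypothetical:
 *
 *	unsigned long start = PAGE_ALIGN(bss_start);
 *	unsigned long len = PAGE_ALIGN(bss_end) - start;
 *	int err;
 *
 *	err = len ? vm_brk(start, len) : 0;
 *	if (err)
 *		return err;
 *
 * vm_brk_flags() additionally lets the caller request VM_EXEC, the only
 * extra flag do_brk_flags() accepts at present.
 */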
2963
2964/* Release all mmaps. */
2965void exit_mmap(struct mm_struct *mm)
2966{
2967        struct mmu_gather tlb;
2968        struct vm_area_struct *vma;
2969        unsigned long nr_accounted = 0;
2970
2971        /* mm's last user has gone, and it's about to be pulled down */
2972        mmu_notifier_release(mm);
2973
2974        if (mm->locked_vm) {
2975                vma = mm->mmap;
2976                while (vma) {
2977                        if (vma->vm_flags & VM_LOCKED)
2978                                munlock_vma_pages_all(vma);
2979                        vma = vma->vm_next;
2980                }
2981        }
2982
2983        arch_exit_mmap(mm);
2984
2985        vma = mm->mmap;
2986        if (!vma)       /* Can happen if dup_mmap() received an OOM */
2987                return;
2988
2989        lru_add_drain();
2990        flush_cache_mm(mm);
2991        tlb_gather_mmu(&tlb, mm, 0, -1);
2992        /* update_hiwater_rss(mm) here? but nobody should be looking */
2993        /* Use -1 here to ensure all VMAs in the mm are unmapped */
2994        unmap_vmas(&tlb, vma, 0, -1);
2995
2996        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2997        tlb_finish_mmu(&tlb, 0, -1);
2998
2999        /*
3000         * Walk the list again, actually closing and freeing it,
3001         * with preemption enabled, without holding any MM locks.
3002         */
3003        while (vma) {
3004                if (vma->vm_flags & VM_ACCOUNT)
3005                        nr_accounted += vma_pages(vma);
3006                vma = remove_vma(vma);
3007        }
3008        vm_unacct_memory(nr_accounted);
3009}
3010
3011/* Insert vm structure into process list sorted by address
3012 * and into the inode's i_mmap tree.  If vm_file is non-NULL
3013 * then i_mmap_rwsem is taken here.
3014 */
3015int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
3016{
3017        struct vm_area_struct *prev;
3018        struct rb_node **rb_link, *rb_parent;
3019
3020        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
3021                           &prev, &rb_link, &rb_parent))
3022                return -ENOMEM;
3023        if ((vma->vm_flags & VM_ACCOUNT) &&
3024             security_vm_enough_memory_mm(mm, vma_pages(vma)))
3025                return -ENOMEM;
3026
3027        /*
3028         * The vm_pgoff of a purely anonymous vma should be irrelevant
3029         * until its first write fault, when page's anon_vma and index
3030         * are set.  But now set the vm_pgoff it will almost certainly
3031         * end up with (unless mremap moves it elsewhere before that
3032         * first write fault), so /proc/pid/maps tells a consistent story.
3033         *
3034         * By setting it to reflect the virtual start address of the
3035         * vma, merges and splits can happen in a seamless way, just
3036         * using the existing file pgoff checks and manipulations.
3037         * Similarly in do_mmap_pgoff and in do_brk.
3038         */
3039        if (vma_is_anonymous(vma)) {
3040                BUG_ON(vma->anon_vma);
3041                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3042        }
3043
3044        vma_link(mm, vma, prev, rb_link, rb_parent);
3045        return 0;
3046}
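
/*
 * Worked example for the vm_pgoff convention above: with 4K pages, an
 * anonymous vma starting at 0x00400000 gets vm_pgoff 0x400
 * (0x00400000 >> PAGE_SHIFT), so the file-offset based merge and split
 * checks can be reused for it unchanged.
 */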
3047
3048/*
3049 * Copy the vma structure to a new location in the same mm,
3050 * prior to moving page table entries, to effect an mremap move.
3051 */
3052struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
3053        unsigned long addr, unsigned long len, pgoff_t pgoff,
3054        bool *need_rmap_locks)
3055{
3056        struct vm_area_struct *vma = *vmap;
3057        unsigned long vma_start = vma->vm_start;
3058        struct mm_struct *mm = vma->vm_mm;
3059        struct vm_area_struct *new_vma, *prev;
3060        struct rb_node **rb_link, *rb_parent;
3061        bool faulted_in_anon_vma = true;
3062
3063        /*
3064         * If the anonymous vma has not yet been faulted in, update the new
3065         * pgoff to match the new location, to increase its chance of merging.
3066         */
3067        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
3068                pgoff = addr >> PAGE_SHIFT;
3069                faulted_in_anon_vma = false;
3070        }
3071
3072        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
3073                return NULL;    /* should never get here */
3074        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
3075                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
3076                            vma->vm_userfaultfd_ctx);
3077        if (new_vma) {
3078                /*
3079                 * Source vma may have been merged into new_vma
3080                 */
3081                if (unlikely(vma_start >= new_vma->vm_start &&
3082                             vma_start < new_vma->vm_end)) {
3083                        /*
3084                         * The only way we can get a vma_merge with
3085                         * self during an mremap is if the vma hasn't
3086                         * been faulted in yet and we were allowed to
3087                         * reset the dst vma->vm_pgoff to the
3088                         * destination address of the mremap to allow
3089                         * the merge to happen. mremap must change the
3090                         * vm_pgoff linearity between src and dst vmas
3091                         * (in turn preventing a vma_merge) to be
3092                         * safe. It is only safe to keep the vm_pgoff
3093                         * linear if there are no pages mapped yet.
3094                         */
3095                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
3096                        *vmap = vma = new_vma;
3097                }
3098                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
3099        } else {
3100                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
3101                if (!new_vma)
3102                        goto out;
3103                *new_vma = *vma;
3104                new_vma->vm_start = addr;
3105                new_vma->vm_end = addr + len;
3106                new_vma->vm_pgoff = pgoff;
3107                if (vma_dup_policy(vma, new_vma))
3108                        goto out_free_vma;
3109                INIT_LIST_HEAD(&new_vma->anon_vma_chain);
3110                if (anon_vma_clone(new_vma, vma))
3111                        goto out_free_mempol;
3112                if (new_vma->vm_file)
3113                        get_file(new_vma->vm_file);
3114                if (new_vma->vm_ops && new_vma->vm_ops->open)
3115                        new_vma->vm_ops->open(new_vma);
3116                vma_link(mm, new_vma, prev, rb_link, rb_parent);
3117                *need_rmap_locks = false;
3118        }
3119        return new_vma;
3120
3121out_free_mempol:
3122        mpol_put(vma_policy(new_vma));
3123out_free_vma:
3124        kmem_cache_free(vm_area_cachep, new_vma);
3125out:
3126        return NULL;
3127}
3128
3129/*
3130 * Return true if the calling process may expand its vm space by the passed
3131 * number of pages
3132 */
3133bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
3134{
3135        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
3136                return false;
3137
3138        if (is_data_mapping(flags) &&
3139            mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
3140                /* Workaround for Valgrind */
3141                if (rlimit(RLIMIT_DATA) == 0 &&
3142                    mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
3143                        return true;
3144                if (!ignore_rlimit_data) {
3145                        pr_warn_once("%s (%d): VmData %lu exceeds data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n",
3146                                     current->comm, current->pid,
3147                                     (mm->data_vm + npages) << PAGE_SHIFT,
3148                                     rlimit(RLIMIT_DATA));
3149                        return false;
3150                }
3151        }
3152
3153        return true;
3154}
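
/*
 * Illustrative sketch (userspace, not kernel code): RLIMIT_AS and RLIMIT_DATA
 * are the limits behind "ulimit -v" and "ulimit -d".  Once may_expand_vm()
 * rejects a request, the caller's mmap() or brk() fails with ENOMEM.  For a
 * private writable (data) mapping, for example:
 *
 *	struct rlimit rl = { .rlim_cur = 1 << 20, .rlim_max = 1 << 20 };
 *
 *	setrlimit(RLIMIT_DATA, &rl);			(1MB data limit)
 *	p = mmap(NULL, 16UL << 20, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);	(fails: ENOMEM)
 *
 * unless the kernel was booted with ignore_rlimit_data, in which case only
 * the warning above is emitted.
 */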
3155
3156void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
3157{
3158        mm->total_vm += npages;
3159
3160        if (is_exec_mapping(flags))
3161                mm->exec_vm += npages;
3162        else if (is_stack_mapping(flags))
3163                mm->stack_vm += npages;
3164        else if (is_data_mapping(flags))
3165                mm->data_vm += npages;
3166}
3167
3168static int special_mapping_fault(struct vm_fault *vmf);
3169
3170/*
3171 * Having a close hook prevents vma merging regardless of flags.
3172 */
3173static void special_mapping_close(struct vm_area_struct *vma)
3174{
3175}
3176
3177static const char *special_mapping_name(struct vm_area_struct *vma)
3178{
3179        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
3180}
3181
3182static int special_mapping_mremap(struct vm_area_struct *new_vma)
3183{
3184        struct vm_special_mapping *sm = new_vma->vm_private_data;
3185
3186        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
3187                return -EFAULT;
3188
3189        if (sm->mremap)
3190                return sm->mremap(sm, new_vma);
3191
3192        return 0;
3193}
3194
3195static const struct vm_operations_struct special_mapping_vmops = {
3196        .close = special_mapping_close,
3197        .fault = special_mapping_fault,
3198        .mremap = special_mapping_mremap,
3199        .name = special_mapping_name,
3200};
3201
3202static const struct vm_operations_struct legacy_special_mapping_vmops = {
3203        .close = special_mapping_close,
3204        .fault = special_mapping_fault,
3205};
3206
3207static int special_mapping_fault(struct vm_fault *vmf)
3208{
3209        struct vm_area_struct *vma = vmf->vma;
3210        pgoff_t pgoff;
3211        struct page **pages;
3212
3213        if (vma->vm_ops == &legacy_special_mapping_vmops) {
3214                pages = vma->vm_private_data;
3215        } else {
3216                struct vm_special_mapping *sm = vma->vm_private_data;
3217
3218                if (sm->fault)
3219                        return sm->fault(sm, vmf->vma, vmf);
3220
3221                pages = sm->pages;
3222        }
3223
3224        for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3225                pgoff--;
3226
3227        if (*pages) {
3228                struct page *page = *pages;
3229                get_page(page);
3230                vmf->page = page;
3231                return 0;
3232        }
3233
3234        return VM_FAULT_SIGBUS;
3235}
3236
3237static struct vm_area_struct *__install_special_mapping(
3238        struct mm_struct *mm,
3239        unsigned long addr, unsigned long len,
3240        unsigned long vm_flags, void *priv,
3241        const struct vm_operations_struct *ops)
3242{
3243        int ret;
3244        struct vm_area_struct *vma;
3245
3246        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
3247        if (unlikely(vma == NULL))
3248                return ERR_PTR(-ENOMEM);
3249
3250        INIT_LIST_HEAD(&vma->anon_vma_chain);
3251        vma->vm_mm = mm;
3252        vma->vm_start = addr;
3253        vma->vm_end = addr + len;
3254
3255        vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
3256        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3257
3258        vma->vm_ops = ops;
3259        vma->vm_private_data = priv;
3260
3261        ret = insert_vm_struct(mm, vma);
3262        if (ret)
3263                goto out;
3264
3265        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
3266
3267        perf_event_mmap(vma);
3268
3269        return vma;
3270
3271out:
3272        kmem_cache_free(vm_area_cachep, vma);
3273        return ERR_PTR(ret);
3274}
3275
3276bool vma_is_special_mapping(const struct vm_area_struct *vma,
3277        const struct vm_special_mapping *sm)
3278{
3279        return vma->vm_private_data == sm &&
3280                (vma->vm_ops == &special_mapping_vmops ||
3281                 vma->vm_ops == &legacy_special_mapping_vmops);
3282}
3283
3284/*
3285 * Called with mm->mmap_sem held for writing.
3286 * Insert a new vma covering the given region, with the given flags.
3287 * Its pages are supplied by the given array of struct page *.
3288 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3289 * The region past the last page supplied will always produce SIGBUS.
3290 * The array pointer and the pages it points to are assumed to stay alive
3291 * for as long as this mapping might exist.
3292 */
3293struct vm_area_struct *_install_special_mapping(
3294        struct mm_struct *mm,
3295        unsigned long addr, unsigned long len,
3296        unsigned long vm_flags, const struct vm_special_mapping *spec)
3297{
3298        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
3299                                        &special_mapping_vmops);
3300}
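
/*
 * Illustrative sketch (not part of this file): architecture code typically
 * uses _install_special_mapping() to map something like the vDSO into a new
 * process.  The names vdso_pages, vdso_spec and map_my_vdso below are
 * hypothetical; real callers live under arch/.
 *
 *	static struct page *vdso_pages[2];	(one page plus NULL terminator)
 *
 *	static const struct vm_special_mapping vdso_spec = {
 *		.name	= "[vdso]",
 *		.pages	= vdso_pages,
 *	};
 *
 *	static int map_my_vdso(struct mm_struct *mm, unsigned long addr)
 *	{
 *		struct vm_area_struct *vma;
 *
 *		vma = _install_special_mapping(mm, addr, PAGE_SIZE,
 *				VM_READ | VM_EXEC |
 *				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
 *				&vdso_spec);
 *		return PTR_ERR_OR_ZERO(vma);
 *	}
 *
 * The resulting vma shows up in /proc/<pid>/maps under the given ->name and
 * never merges with its neighbours because of the ->close hook above.
 */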
3301
3302int install_special_mapping(struct mm_struct *mm,
3303                            unsigned long addr, unsigned long len,
3304                            unsigned long vm_flags, struct page **pages)
3305{
3306        struct vm_area_struct *vma = __install_special_mapping(
3307                mm, addr, len, vm_flags, (void *)pages,
3308                &legacy_special_mapping_vmops);
3309
3310        return PTR_ERR_OR_ZERO(vma);
3311}
3312
3313static DEFINE_MUTEX(mm_all_locks_mutex);
3314
3315static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3316{
3317        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3318                /*
3319                 * The LSB of rb_root.rb_node can't change from under us
3320                 * because we hold the mm_all_locks_mutex.
3321                 */
3322                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
3323                /*
3324                 * We can safely set the low bit of rb_root.rb_node after taking the
3325                 * anon_vma->root->rwsem. If some other vma in this mm shares
3326                 * the same anon_vma we won't take it again.
3327                 *
3328                 * No need for atomic instructions here; rb_root.rb_node
3329                 * can't change from under us thanks to the
3330                 * anon_vma->root->rwsem.
3331                 */
3332                if (__test_and_set_bit(0, (unsigned long *)
3333                                       &anon_vma->root->rb_root.rb_node))
3334                        BUG();
3335        }
3336}
3337
3338static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3339{
3340        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3341                /*
3342                 * AS_MM_ALL_LOCKS can't change from under us because
3343                 * we hold the mm_all_locks_mutex.
3344                 *
3345                 * Operations on ->flags have to be atomic because
3346                 * even if AS_MM_ALL_LOCKS is stable thanks to the
3347                 * mm_all_locks_mutex, there may be other cpus
3348                 * changing other bitflags in parallel to us.
3349                 */
3350                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3351                        BUG();
3352                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3353        }
3354}
3355
3356/*
3357 * This operation locks against the VM for all pte/vma/mm related
3358 * operations that could ever happen on a certain mm. This includes
3359 * vmtruncate, try_to_unmap, and all page faults.
3360 *
3361 * The caller must take the mmap_sem in write mode before calling
3362 * mm_take_all_locks(). The caller isn't allowed to release the
3363 * mmap_sem until mm_drop_all_locks() returns.
3364 *
3365 * mmap_sem in write mode is required in order to block all operations
3366 * that could modify pagetables and free pages without needing to
3367 * alter the vma layout. It's also needed in write mode to prevent new
3368 * anon_vmas from being associated with existing vmas.
3369 *
3370 * A single task can't take more than one mm_take_all_locks() in a row
3371 * or it would deadlock.
3372 *
3373 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3374 * mapping->flags prevent taking the same lock twice when more than one
3375 * vma in this mm is backed by the same anon_vma or address_space.
3376 *
3377 * We take locks in the following order, according to the comment at the
3378 * beginning of mm/rmap.c:
3379 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
3380 *     hugetlb mapping);
3381 *   - all i_mmap_rwsem locks;
3382 *   - all anon_vma->rwsem locks.
3383 *
3384 * We can take all locks within these types in any order because the VM code
3385 * doesn't nest them and we are protected from parallel mm_take_all_locks() by
3386 * mm_all_locks_mutex.
3387 *
3388 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
3389 * that may have to take thousands of locks.
3390 *
3391 * mm_take_all_locks() can fail if it's interrupted by signals.
3392 */
3393int mm_take_all_locks(struct mm_struct *mm)
3394{
3395        struct vm_area_struct *vma;
3396        struct anon_vma_chain *avc;
3397
3398        BUG_ON(down_read_trylock(&mm->mmap_sem));
3399
3400        mutex_lock(&mm_all_locks_mutex);
3401
3402        for (vma = mm->mmap; vma; vma = vma->vm_next) {
3403                if (signal_pending(current))
3404                        goto out_unlock;
3405                if (vma->vm_file && vma->vm_file->f_mapping &&
3406                                is_vm_hugetlb_page(vma))
3407                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
3408        }
3409
3410        for (vma = mm->mmap; vma; vma = vma->vm_next) {
3411                if (signal_pending(current))
3412                        goto out_unlock;
3413                if (vma->vm_file && vma->vm_file->f_mapping &&
3414                                !is_vm_hugetlb_page(vma))
3415                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
3416        }
3417
3418        for (vma = mm->mmap; vma; vma = vma->vm_next) {
3419                if (signal_pending(current))
3420                        goto out_unlock;
3421                if (vma->anon_vma)
3422                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3423                                vm_lock_anon_vma(mm, avc->anon_vma);
3424        }
3425
3426        return 0;
3427
3428out_unlock:
3429        mm_drop_all_locks(mm);
3430        return -EINTR;
3431}
3432
3433static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3434{
3435        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3436                /*
3437                 * The LSB of rb_root.rb_node can't change to 0 from under
3438                 * us because we hold the mm_all_locks_mutex.
3439                 *
3440                 * We must however clear the bitflag before unlocking
3441                 * the anon_vma so that users of anon_vma->rb_root will
3442                 * never see our bitflag.
3443                 *
3444                 * No need for atomic instructions here; rb_root.rb_node
3445                 * can't change from under us until we release the
3446                 * anon_vma->root->rwsem.
3447                 */
3448                if (!__test_and_clear_bit(0, (unsigned long *)
3449                                          &anon_vma->root->rb_root.rb_node))
3450                        BUG();
3451                anon_vma_unlock_write(anon_vma);
3452        }
3453}
3454
3455static void vm_unlock_mapping(struct address_space *mapping)
3456{
3457        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3458                /*
3459                 * AS_MM_ALL_LOCKS can't change to 0 from under us
3460                 * because we hold the mm_all_locks_mutex.
3461                 */
3462                i_mmap_unlock_write(mapping);
3463                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3464                                        &mapping->flags))
3465                        BUG();
3466        }
3467}
3468
3469/*
3470 * The mmap_sem cannot be released by the caller until
3471 * mm_drop_all_locks() returns.
3472 */
3473void mm_drop_all_locks(struct mm_struct *mm)
3474{
3475        struct vm_area_struct *vma;
3476        struct anon_vma_chain *avc;
3477
3478        BUG_ON(down_read_trylock(&mm->mmap_sem));
3479        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3480
3481        for (vma = mm->mmap; vma; vma = vma->vm_next) {
3482                if (vma->anon_vma)
3483                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3484                                vm_unlock_anon_vma(avc->anon_vma);
3485                if (vma->vm_file && vma->vm_file->f_mapping)
3486                        vm_unlock_mapping(vma->vm_file->f_mapping);
3487        }
3488
3489        mutex_unlock(&mm_all_locks_mutex);
3490}
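
/*
 * Illustrative sketch (not part of this file): the canonical user of
 * mm_take_all_locks()/mm_drop_all_locks() is mmu_notifier registration,
 * which needs every rmap lock in the mm held while it hooks itself in.
 * Following the rules in the comment above mm_take_all_locks(), such a
 * caller looks roughly like:
 *
 *	down_write(&mm->mmap_sem);
 *	ret = mm_take_all_locks(mm);
 *	if (ret)			(-EINTR: interrupted by a signal)
 *		goto out;
 *
 *	... do the work that must exclude all rmap walkers ...
 *
 *	mm_drop_all_locks(mm);
 * out:
 *	up_write(&mm->mmap_sem);
 */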
3491
3492/*
3493 * initialise the percpu counter for VM
3494 */
3495void __init mmap_init(void)
3496{
3497        int ret;
3498
3499        ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3500        VM_BUG_ON(ret);
3501}
3502
3503/*
3504 * Initialise sysctl_user_reserve_kbytes.
3505 *
3506 * This is intended to prevent a user from starting a single memory-hogging
3507 * process that leaves them unable to recover (kill the hog) in
3508 * OVERCOMMIT_NEVER mode.
3509 *
3510 * The default value is min(3% of free memory, 128MB)
3511 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3512 */
3513static int init_user_reserve(void)
3514{
3515        unsigned long free_kbytes;
3516
3517        free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3518
3519        sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3520        return 0;
3521}
3522subsys_initcall(init_user_reserve);
3523
3524/*
3525 * Initialise sysctl_admin_reserve_kbytes.
3526 *
3527 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3528 * to log in and kill a memory hogging process.
3529 *
3530 * Systems with more than 256MB will reserve 8MB, enough to recover
3531 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3532 * only reserve 3% of free pages by default.
3533 */
3534static int init_admin_reserve(void)
3535{
3536        unsigned long free_kbytes;
3537
3538        free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3539
3540        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3541        return 0;
3542}
3543subsys_initcall(init_admin_reserve);
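
/*
 * For reference on the constants used above: free_kbytes / 32 is roughly 3%
 * of free memory, 1UL << 17 kilobytes is 128MB and 1UL << 13 kilobytes is
 * 8MB.  For example, with 8GB free, free_kbytes / 32 comes to 256MB, so the
 * user reserve is clamped to its 128MB maximum and the admin reserve to 8MB.
 */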
3544
3545/*
3546 * Reinitialise user and admin reserves if memory is added or removed.
3547 *
3548 * The default user reserve max is 128MB, and the default max for the
3549 * admin reserve is 8MB. These are usually, but not always, enough to
3550 * enable recovery from a memory hogging process using login/sshd, a shell,
3551 * and tools like top. It may make sense to increase or even disable the
3552 * reserve depending on the existence of swap or variations in the recovery
3553 * tools. So, the admin may have changed them.
3554 *
3555 * If memory is added and the reserves have been eliminated or increased above
3556 * the default max, then we'll trust the admin.
3557 *
3558 * If memory is removed and there isn't enough free memory, then we
3559 * need to reset the reserves.
3560 *
3561 * Otherwise keep the reserve set by the admin.
3562 */
3563static int reserve_mem_notifier(struct notifier_block *nb,
3564                             unsigned long action, void *data)
3565{
3566        unsigned long tmp, free_kbytes;
3567
3568        switch (action) {
3569        case MEM_ONLINE:
3570                /* Default max is 128MB. Leave alone if modified by operator. */
3571                tmp = sysctl_user_reserve_kbytes;
3572                if (0 < tmp && tmp < (1UL << 17))
3573                        init_user_reserve();
3574
3575                /* Default max is 8MB.  Leave alone if modified by operator. */
3576                tmp = sysctl_admin_reserve_kbytes;
3577                if (0 < tmp && tmp < (1UL << 13))
3578                        init_admin_reserve();
3579
3580                break;
3581        case MEM_OFFLINE:
3582                free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3583
3584                if (sysctl_user_reserve_kbytes > free_kbytes) {
3585                        init_user_reserve();
3586                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
3587                                sysctl_user_reserve_kbytes);
3588                }
3589
3590                if (sysctl_admin_reserve_kbytes > free_kbytes) {
3591                        init_admin_reserve();
3592                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3593                                sysctl_admin_reserve_kbytes);
3594                }
3595                break;
3596        default:
3597                break;
3598        }
3599        return NOTIFY_OK;
3600}
3601
3602static struct notifier_block reserve_mem_nb = {
3603        .notifier_call = reserve_mem_notifier,
3604};
3605
3606static int __meminit init_reserve_notifier(void)
3607{
3608        if (register_hotmemory_notifier(&reserve_mem_nb))
3609                pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3610
3611        return 0;
3612}
3613subsys_initcall(init_reserve_notifier);
3614