// SPDX-License-Identifier: GPL-2.0
/*
 *      mm/mremap.c
 *
 *      (C) Copyright 1996 Linus Torvalds
 *
 *      Address space accounting code   <alan@lxorguk.ukuu.org.uk>
 *      (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

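/*
 * Look up the source PMD covering @addr in @mm's page tables; returns NULL
 * if any level of the walk (pgd, p4d, pud, pmd) is not populated.
 */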
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        if (pgd_none_or_clear_bad(pgd))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (p4d_none_or_clear_bad(p4d))
                return NULL;

        pud = pud_offset(p4d, addr);
        if (pud_none_or_clear_bad(pud))
                return NULL;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;

        return pmd;
}

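/*
 * Allocate (if necessary) the p4d/pud/pmd levels needed to map @addr in the
 * destination page tables; returns the PMD, or NULL on allocation failure.
 */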
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return NULL;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));

        return pmd;
}

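/*
 * Take/release the rmap locks (i_mmap_rwsem for file mappings, the anon_vma
 * lock for anonymous ones) so rmap walkers cannot run concurrently with the
 * page table move.
 */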
static void take_rmap_locks(struct vm_area_struct *vma)
{
        if (vma->vm_file)
                i_mmap_lock_write(vma->vm_file->f_mapping);
        if (vma->anon_vma)
                anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
        if (vma->anon_vma)
                anon_vma_unlock_write(vma->anon_vma);
        if (vma->vm_file)
                i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
        /*
         * Set soft dirty bit so we can notice
         * in userspace the ptes were moved.
         */
#ifdef CONFIG_MEM_SOFT_DIRTY
        if (pte_present(pte))
                pte = pte_mksoft_dirty(pte);
        else if (is_swap_pte(pte))
                pte = pte_swp_mksoft_dirty(pte);
#endif
        return pte;
}

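/*
 * Move the ptes covering [old_addr, old_end) in @vma to the corresponding
 * range of @new_vma, one pte at a time, holding both pte locks. A present
 * pte forces a TLB flush before the old pte lock is dropped.
 */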
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
                unsigned long new_addr, bool need_rmap_locks)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
        bool force_flush = false;
        unsigned long len = old_end - old_addr;

        /*
         * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
         * locks to ensure that rmap will always observe either the old or the
         * new ptes. This is the easiest way to avoid races with
         * truncate_pagecache(), page migration, etc...
         *
         * When need_rmap_locks is false, we use other ways to avoid
         * such races:
         *
         * - During exec() shift_arg_pages(), we use a specially tagged vma
         *   which rmap call sites look for using vma_is_temporary_stack().
         *
         * - During mremap(), new_vma is often known to be placed after vma
         *   in rmap traversal order. This ensures rmap will always observe
         *   either the old pte, or the new pte, or both (the page table locks
         *   serialize access to individual ptes, but only rmap traversal
         *   order guarantees that we won't miss both the old and new ptes).
         */
        if (need_rmap_locks)
                take_rmap_locks(vma);

        /*
         * We don't have to worry about the ordering of src and dst
         * pte locks because exclusive mmap_lock prevents deadlock.
         */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        new_pte = pte_offset_map(new_pmd, new_addr);
        new_ptl = pte_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();

        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;

                pte = ptep_get_and_clear(mm, old_addr, old_pte);
                /*
                 * If we are remapping a valid PTE, make sure
                 * to flush TLB before we drop the PTL for the
                 * PTE.
                 *
                 * NOTE! Both old and new PTL matter: the old one
                 * for racing with page_mkclean(), the new one to
                 * make sure the physical page stays valid until
                 * the TLB entry for the old mapping has been
                 * flushed.
                 */
                if (pte_present(pte))
                        force_flush = true;
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                pte = move_soft_dirty_pte(pte);
                set_pte_at(mm, new_addr, new_pte, pte);
        }

        arch_leave_lazy_mmu_mode();
        if (force_flush)
                flush_tlb_range(vma, old_end - len, old_end);
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (need_rmap_locks)
                drop_rmap_locks(vma);
}

#ifdef CONFIG_HAVE_MOVE_PMD
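/*
 * Try to move a whole page table page at once: if both addresses are
 * PMD-aligned and a full PMD_SIZE remains, transplant the old PMD entry to
 * the (empty) destination PMD instead of copying individual ptes.
 */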
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
{
        spinlock_t *old_ptl, *new_ptl;
        struct mm_struct *mm = vma->vm_mm;
        pmd_t pmd;

        if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
            || old_end - old_addr < PMD_SIZE)
                return false;

        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have released it.
         *
         * However, there's a case during execve() where we use mremap
         * to move the initial stack, and in that case the target area
         * may overlap the source area (always moving down).
         *
         * If everything is PMD-aligned, that works fine, as moving
         * each pmd down will clear the source pmd. But if we first
         * have a few 4kB-only pages that get moved down, and then
         * hit the "now the rest is PMD-aligned, let's do everything
         * one pmd at a time", we will still have the old (now empty
         * of any 4kB pages, but still there) PMD in the page table
         * tree.
         *
         * Warn on it once - because we really should try to figure
         * out how to do this better - but then say "I won't move
         * this pmd".
         *
         * One alternative might be to just unmap the target pmd at
         * this point, and verify that it really is empty. We'll see.
         */
        if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
                return false;

        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_lock prevents deadlock.
         */
        old_ptl = pmd_lock(vma->vm_mm, old_pmd);
        new_ptl = pmd_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

        /* Clear the pmd */
        pmd = *old_pmd;
        pmd_clear(old_pmd);

        VM_BUG_ON(!pmd_none(*new_pmd));

        /* Set the new pmd */
        set_pmd_at(mm, new_addr, new_pmd, pmd);
        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        spin_unlock(old_ptl);

        return true;
}
#endif

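/*
 * Move the page tables for [old_addr, old_addr + len) from @vma to @new_vma,
 * one PMD extent at a time: huge PMDs are moved (or split), whole page table
 * pages are transplanted when possible, and move_ptes() handles the rest.
 * Returns the number of bytes actually moved.
 */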
unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks)
{
        unsigned long extent, next, old_end;
        struct mmu_notifier_range range;
        pmd_t *old_pmd, *new_pmd;

        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                old_addr, old_end);
        mmu_notifier_invalidate_range_start(&range);

        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
                next = (old_addr + PMD_SIZE) & PMD_MASK;
                /* even if next overflowed, extent below will be ok */
                extent = next - old_addr;
                if (extent > old_end - old_addr)
                        extent = old_end - old_addr;
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
                new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
                if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
                        if (extent == HPAGE_PMD_SIZE) {
                                bool moved;
                                /* See comment in move_ptes() */
                                if (need_rmap_locks)
                                        take_rmap_locks(vma);
                                moved = move_huge_pmd(vma, old_addr, new_addr,
                                                    old_end, old_pmd, new_pmd);
                                if (need_rmap_locks)
                                        drop_rmap_locks(vma);
                                if (moved)
                                        continue;
                        }
                        split_huge_pmd(vma, old_pmd, old_addr);
                        if (pmd_trans_unstable(old_pmd))
                                continue;
                } else if (extent == PMD_SIZE) {
#ifdef CONFIG_HAVE_MOVE_PMD
                        /*
                         * If the extent is PMD-sized, try to speed the move by
                         * moving at the PMD level if possible.
                         */
                        bool moved;

                        if (need_rmap_locks)
                                take_rmap_locks(vma);
                        moved = move_normal_pmd(vma, old_addr, new_addr,
                                        old_end, old_pmd, new_pmd);
                        if (need_rmap_locks)
                                drop_rmap_locks(vma);
                        if (moved)
                                continue;
#endif
                }

                if (pte_alloc(new_vma->vm_mm, new_pmd))
                        break;
                next = (new_addr + PMD_SIZE) & PMD_MASK;
                if (extent > next - new_addr)
                        extent = next - new_addr;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
                          new_pmd, new_addr, need_rmap_locks);
        }

        mmu_notifier_invalidate_range_end(&range);

        return len + old_addr - old_end;        /* how much done */
}

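/*
 * Move @vma's pages from [old_addr, old_addr + old_len) to @new_addr: copy
 * the vma, move the page tables (moving them back on failure), fix up
 * accounting and mlock state, and unmap the old range unless
 * MREMAP_DONTUNMAP was requested. Returns the new address or an error value.
 */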
static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
                unsigned long new_len, unsigned long new_addr,
                bool *locked, unsigned long flags,
                struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
        unsigned long hiwater_vm;
        int split = 0;
        int err;
        bool need_rmap_locks;

        /*
         * We'd prefer to avoid failure later on in do_munmap,
         * which may split one vma into three before unmapping.
         */
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /*
         * Advise KSM to break any KSM pages in the area to be moved:
         * it would be confusing if they were to turn up at the new
         * location, where they happen to coincide with different KSM
         * pages recently unmapped.  But leave vma->vm_flags as it was,
         * so KSM can come around to merge on vma and new_vma afterwards.
         */
        err = ksm_madvise(vma, old_addr, old_addr + old_len,
                                                MADV_UNMERGEABLE, &vm_flags);
        if (err)
                return err;

        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
                           &need_rmap_locks);
        if (!new_vma)
                return -ENOMEM;

        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len) {
                err = -ENOMEM;
        } else if (vma->vm_ops && vma->vm_ops->mremap) {
                err = vma->vm_ops->mremap(new_vma);
        }

        if (unlikely(err)) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
                new_addr = err;
        } else {
                mremap_userfaultfd_prep(new_vma, uf);
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
        }

        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
                vma->vm_flags &= ~VM_ACCOUNT;
                excess = vma->vm_end - vma->vm_start - old_len;
                if (old_addr > vma->vm_start &&
                    old_addr + old_len < vma->vm_end)
                        split = 1;
        }

        /*
         * If we failed to move page tables we still do total_vm increment
         * since do_munmap() will decrement it by old_len == new_len.
         *
         * Since total_vm is about to be raised artificially high for a
         * moment, we need to restore high watermark afterwards: if stats
         * are taken meanwhile, total_vm and hiwater_vm appear too high.
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
        vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

        /* Tell pfn tracking code that the pfnmap has moved from this vma */
        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn_moved(vma);

        if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
                if (vm_flags & VM_ACCOUNT) {
                        /* Always put back VM_ACCOUNT since we won't unmap */
                        vma->vm_flags |= VM_ACCOUNT;

                        vm_acct_memory(new_len >> PAGE_SHIFT);
                }

                /*
                 * VMAs can actually be merged back together when copy_vma()
                 * calls vma_merge(). This can happen with anonymous vmas
                 * which have not yet been faulted, so if we were to consider
                 * this VMA split we'll end up adding VM_ACCOUNT on the
                 * next VMA, which is completely unrelated if this VMA
                 * was re-merged.
                 */
                if (split && new_vma == vma)
                        split = 0;

                /* We always clear VM_LOCKED[ONFAULT] on the old vma */
                vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

                /* Because we won't unmap we don't need to touch locked_vm */
                goto out;
        }

        if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }

        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                *locked = true;
        }
out:
        mm->hiwater_vm = hiwater_vm;

        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
                vma->vm_flags |= VM_ACCOUNT;
                if (split)
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }

        return new_addr;
}

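/*
 * Look up and validate the vma at @addr for an mremap() to @new_len: check
 * vma bounds, hugetlb, VM_DONTEXPAND/VM_PFNMAP, mlock limits and memory
 * accounting. Returns the vma, with the number of newly charged pages in *p,
 * or an ERR_PTR() on failure.
 */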
static struct vm_area_struct *vma_to_resize(unsigned long addr,
        unsigned long old_len, unsigned long new_len, unsigned long flags,
        unsigned long *p)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);
        unsigned long pgoff;

        if (!vma || vma->vm_start > addr)
                return ERR_PTR(-EFAULT);

        /*
         * !old_len is a special case where an attempt is made to 'duplicate'
         * a mapping.  This makes no sense for private mappings as it will
         * instead create a fresh/new mapping unrelated to the original.  This
         * is contrary to the basic idea of mremap which creates new mappings
         * based on the original.  There are no known use cases for this
         * behavior.  As a result, fail such attempts.
         */
        if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
                pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
                return ERR_PTR(-EINVAL);
        }

        if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
                        vma->vm_flags & VM_SHARED))
                return ERR_PTR(-EINVAL);

        if (is_vm_hugetlb_page(vma))
                return ERR_PTR(-EINVAL);

        /* We can't remap across vm area boundaries */
        if (old_len > vma->vm_end - addr)
                return ERR_PTR(-EFAULT);

        if (new_len == old_len)
                return vma;

        /* Need to be careful about a growing mapping */
        pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
        pgoff += vma->vm_pgoff;
        if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
                return ERR_PTR(-EINVAL);

        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
                return ERR_PTR(-EFAULT);

        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
                locked = mm->locked_vm << PAGE_SHIFT;
                lock_limit = rlimit(RLIMIT_MEMLOCK);
                locked += new_len - old_len;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        return ERR_PTR(-EAGAIN);
        }

        if (!may_expand_vm(mm, vma->vm_flags,
                                (new_len - old_len) >> PAGE_SHIFT))
                return ERR_PTR(-ENOMEM);

        if (vma->vm_flags & VM_ACCOUNT) {
                unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        return ERR_PTR(-ENOMEM);
                *p = charged;
        }

        return vma;
}

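/*
 * Handle the MREMAP_FIXED / MREMAP_DONTUNMAP paths: validate the destination,
 * unmap whatever is needed at the old and new locations, pick the new address
 * and hand off to move_vma().
 */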
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
                unsigned long new_addr, unsigned long new_len, bool *locked,
                unsigned long flags, struct vm_userfaultfd_ctx *uf,
                struct list_head *uf_unmap_early,
                struct list_head *uf_unmap)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        unsigned long map_flags = 0;

        if (offset_in_page(new_addr))
                goto out;

        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                goto out;

        /* Ensure the old/new locations do not overlap */
        if (addr + old_len > new_addr && new_addr + new_len > addr)
                goto out;

        /*
         * move_vma() needs us to stay 4 maps below the threshold, otherwise
         * it will bail out at the very beginning.
         * That is a problem if we have already unmapped the regions here
         * (new_addr, and old_addr), because userspace will not know the
         * state of the vmas after it gets -ENOMEM.
         * So, to avoid such a scenario, we pre-compute whether the whole
         * operation has a good chance of succeeding map-wise.
         * The worst case is when both vmas (new_addr and old_addr) get
         * split in 3 before unmapping them.
         * That means 2 more maps (1 for each) on top of the ones we already
         * hold. Check whether the current map count plus 2 still leaves us
         * 4 maps below the threshold, otherwise return -ENOMEM here to be
         * safe.
         */
        if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
                return -ENOMEM;

        if (flags & MREMAP_FIXED) {
                ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
                if (ret)
                        goto out;
        }

        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
        }

        vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
        if (flags & MREMAP_DONTUNMAP &&
                !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
                ret = -ENOMEM;
                goto out;
        }

        if (flags & MREMAP_FIXED)
                map_flags |= MAP_FIXED;

        if (vma->vm_flags & VM_MAYSHARE)
                map_flags |= MAP_SHARED;

        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
        if (IS_ERR_VALUE(ret))
                goto out1;

        /* We got a new mapping */
        if (!(flags & MREMAP_FIXED))
                new_addr = ret;

        ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
                       uf_unmap);

        if (!(offset_in_page(ret)))
                goto out;

out1:
        vm_unacct_memory(charged);

out:
        return ret;
}

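/*
 * Return 1 if @vma can be grown in place by @delta bytes: the new end must
 * not overflow, must not run into the next vma, and the enlarged range must
 * still be acceptable to get_unmapped_area().
 */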
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
        unsigned long end = vma->vm_end + delta;
        if (end < vma->vm_end) /* overflow */
                return 0;
        if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
                return 0;
        if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                              0, MAP_FIXED) & ~PAGE_MASK)
                return 0;
        return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        bool locked = false;
        bool downgraded = false;
        struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
        LIST_HEAD(uf_unmap_early);
        LIST_HEAD(uf_unmap);

        /*
         * There is a deliberate asymmetry here: we strip the pointer tag
         * from the old address but leave the new address alone. This is
         * for consistency with mmap(), where we prevent the creation of
         * aliasing mappings in userspace by leaving the tag bits of the
         * mapping address intact. A non-zero tag will cause the subsequent
         * range checks to reject the address as invalid.
         *
         * See Documentation/arm64/tagged-address-abi.rst for more information.
         */
        addr = untagged_addr(addr);

        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
                return ret;

        if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
                return ret;

        /*
         * MREMAP_DONTUNMAP is always a move and it does not allow resizing
         * in the process.
         */
        if (flags & MREMAP_DONTUNMAP &&
                        (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
                return ret;

        if (offset_in_page(addr))
                return ret;

        old_len = PAGE_ALIGN(old_len);
        new_len = PAGE_ALIGN(new_len);

        /*
         * We allow a zero old-len as a special case
         * for DOS-emu "duplicate shm area" thing. But
         * a zero new-len is nonsensical.
         */
        if (!new_len)
                return ret;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
                                &locked, flags, &uf, &uf_unmap_early,
                                &uf_unmap);
                goto out;
        }

        /*
         * Always allow a shrinking remap: that just unmaps
         * the unnecessary pages..
         * __do_munmap does all the needed commit accounting, and
         * downgrades mmap_lock to read if so directed.
         */
        if (old_len >= new_len) {
                int retval;

                retval = __do_munmap(mm, addr+new_len, old_len - new_len,
                                  &uf_unmap, true);
                if (retval < 0 && old_len != new_len) {
                        ret = retval;
                        goto out;
                /* Returning 1 indicates mmap_lock is downgraded to read. */
                } else if (retval == 1)
                        downgraded = true;
                ret = addr;
                goto out;
        }

        /*
         * Ok, we need to grow..
         */
        vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* old_len exactly to the end of the area.. */
        if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
                if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;

                        if (vma_adjust(vma, vma->vm_start, addr + new_len,
                                       vma->vm_pgoff, NULL)) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        vm_stat_account(mm, vma->vm_flags, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                locked = true;
                                new_addr = addr;
                        }
                        ret = addr;
                        goto out;
                }
        }

        /*
         * We weren't able to just expand or shrink the area,
         * we need to create a new one and move it..
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
                unsigned long map_flags = 0;
                if (vma->vm_flags & VM_MAYSHARE)
                        map_flags |= MAP_SHARED;

                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                                        vma->vm_pgoff +
                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
                                        map_flags);
                if (IS_ERR_VALUE(new_addr)) {
                        ret = new_addr;
                        goto out;
                }

                ret = move_vma(vma, addr, old_len, new_len, new_addr,
                               &locked, flags, &uf, &uf_unmap);
        }
out:
        if (offset_in_page(ret)) {
                vm_unacct_memory(charged);
                locked = false;
        }
        if (downgraded)
                mmap_read_unlock(current->mm);
        else
                mmap_write_unlock(current->mm);
        if (locked && new_len > old_len)
                mm_populate(new_addr + old_len, new_len - old_len);
        userfaultfd_unmap_complete(mm, &uf_unmap_early);
        mremap_userfaultfd_complete(&uf, addr, ret, old_len);
        userfaultfd_unmap_complete(mm, &uf_unmap);
        return ret;
}