linux/mm/mremap.c
/*
 *      mm/mremap.c
 *
 *      (C) Copyright 1996 Linus Torvalds
 *
 *      Address space accounting code   <alan@lxorguk.ukuu.org.uk>
 *      (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

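/*
 * Walk the source page tables down to the pmd covering @addr.  Returns
 * NULL if nothing is mapped there (so there is nothing to move); any
 * transparent huge page is split first so the pte-level copy loop in
 * move_ptes() can handle the range.
 */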
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        if (pgd_none_or_clear_bad(pgd))
                return NULL;

        pud = pud_offset(pgd, addr);
        if (pud_none_or_clear_bad(pud))
                return NULL;

        pmd = pmd_offset(pud, addr);
        split_huge_page_pmd(mm, pmd);
        if (pmd_none_or_clear_bad(pmd))
                return NULL;

        return pmd;
}

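/*
 * Allocate, if necessary, the pud, pmd and pte page for @addr in the
 * destination range.  Returns the pmd, or NULL on allocation failure;
 * the caller treats NULL as "stop moving here".
 */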
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                return NULL;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
                return NULL;

        return pmd;
}

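/*
 * Move the ptes spanning [old_addr, old_end) under @old_pmd to the
 * corresponding slots under @new_pmd.  Each present entry is cleared and
 * flushed from the old location, passed through move_pte() for any
 * architecture fixups, then installed at the new address.  MMU notifiers
 * are told about the range being invalidated, and for file mappings the
 * i_mmap lock is held so truncation cannot race with the move.
 */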
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
                unsigned long new_addr)
{
        struct address_space *mapping = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
        unsigned long old_start;

        old_start = old_addr;
        mmu_notifier_invalidate_range_start(vma->vm_mm,
                                            old_start, old_end);
        if (vma->vm_file) {
                /*
                 * Subtle point from Rajesh Venkatasubramanian: before
                 * moving file-based ptes, we must lock truncate_pagecache
                 * out, since it might clean the dst vma before the src vma,
                 * and we propagate stale pages into the dst afterward.
                 */
                mapping = vma->vm_file->f_mapping;
                spin_lock(&mapping->i_mmap_lock);
                new_vma->vm_truncate_count = 0;
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * pte locks because exclusive mmap_sem prevents deadlock.
         */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        new_pte = pte_offset_map(new_pmd, new_addr);
        new_ptl = pte_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        arch_enter_lazy_mmu_mode();

        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;
                pte = ptep_clear_flush(vma, old_addr, old_pte);
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                set_pte_at(mm, new_addr, new_pte, pte);
        }

        arch_leave_lazy_mmu_mode();
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
                spin_unlock(&mapping->i_mmap_lock);
        mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}

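/*
 * Cap on how much each move_ptes() call handles, so the loop in
 * move_page_tables() reaches cond_resched() at least every 64 pages
 * even inside a single pmd.
 */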
#define LATENCY_LIMIT   (64 * PAGE_SIZE)

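/*
 * Copy up to @len bytes worth of page table entries from @vma at
 * @old_addr to @new_vma at @new_addr, one pmd-sized (or LATENCY_LIMIT-
 * sized) chunk at a time.  Returns how many bytes were actually moved;
 * a short return means alloc_new_pmd() failed partway through.
 */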
unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len)
{
        unsigned long extent, next, old_end;
        pmd_t *old_pmd, *new_pmd;

        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);

        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
                next = (old_addr + PMD_SIZE) & PMD_MASK;
                if (next - 1 > old_end)
                        next = old_end;
                extent = next - old_addr;
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
                new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
                next = (new_addr + PMD_SIZE) & PMD_MASK;
                if (extent > next - new_addr)
                        extent = next - new_addr;
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent,
                                new_vma, new_pmd, new_addr);
        }

        return len + old_addr - old_end;        /* how much done */
}

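/*
 * Move @vma's [old_addr, old_addr + old_len) into a freshly copied vma
 * at @new_addr, then unmap the old range.  If moving the page tables
 * fails partway, the already-moved entries are moved back and the new
 * area is unmapped instead, so the caller gets -ENOMEM with the original
 * mapping intact.  VM_ACCOUNT is hidden across do_munmap() so the
 * existing memory reservation is carried over rather than released and
 * recharged.
 */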
static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
                unsigned long new_len, unsigned long new_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
        unsigned long hiwater_vm;
        int split = 0;
        int err;

        /*
         * We'd prefer to avoid failure later on in do_munmap:
         * which may split one vma into three before unmapping.
         */
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /*
         * Advise KSM to break any KSM pages in the area to be moved:
         * it would be confusing if they were to turn up at the new
         * location, where they happen to coincide with different KSM
         * pages recently unmapped.  But leave vma->vm_flags as it was,
         * so KSM can come around to merge on vma and new_vma afterwards.
         */
        err = ksm_madvise(vma, old_addr, old_addr + old_len,
                                                MADV_UNMERGEABLE, &vm_flags);
        if (err)
                return err;

        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
        if (!new_vma)
                return -ENOMEM;

        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
        if (moved_len < old_len) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
        }

        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
                vma->vm_flags &= ~VM_ACCOUNT;
                excess = vma->vm_end - vma->vm_start - old_len;
                if (old_addr > vma->vm_start &&
                    old_addr + old_len < vma->vm_end)
                        split = 1;
        }

        /*
         * If we failed to move page tables we still do total_vm increment
         * since do_munmap() will decrement it by old_len == new_len.
         *
         * Since total_vm is about to be raised artificially high for a
         * moment, we need to restore high watermark afterwards: if stats
         * are taken meanwhile, total_vm and hiwater_vm appear too high.
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
        mm->total_vm += new_len >> PAGE_SHIFT;
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }
        mm->hiwater_vm = hiwater_vm;

        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
                vma->vm_flags |= VM_ACCOUNT;
                if (split)
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }

        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                if (new_len > old_len)
                        mlock_vma_pages_range(new_vma, new_addr + old_len,
                                                       new_addr + new_len);
        }

        return new_addr;
}

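/*
 * Look up and sanity-check the vma backing a resize request: the whole
 * old range must lie inside one vma, hugetlb mappings are rejected, and
 * a growing request is checked against VM_DONTEXPAND/VM_PFNMAP, file
 * offset overflow, RLIMIT_MEMLOCK and the address-space limit.  For a
 * VM_ACCOUNT vma the extra pages are charged up front, with the charge
 * reported back through *p so the caller can unaccount on failure.
 * Returns the vma or an ERR_PTR().
 */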
static struct vm_area_struct *vma_to_resize(unsigned long addr,
        unsigned long old_len, unsigned long new_len, unsigned long *p)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);

        if (!vma || vma->vm_start > addr)
                goto Efault;

        if (is_vm_hugetlb_page(vma))
                goto Einval;

        /* We can't remap across vm area boundaries */
        if (old_len > vma->vm_end - addr)
                goto Efault;

        /* Need to be careful about a growing mapping */
        if (new_len > old_len) {
                unsigned long pgoff;

                if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
                        goto Efault;
                pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
                pgoff += vma->vm_pgoff;
                if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
                        goto Einval;
        }

        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
                locked = mm->locked_vm << PAGE_SHIFT;
                lock_limit = rlimit(RLIMIT_MEMLOCK);
                locked += new_len - old_len;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        goto Eagain;
        }

        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
                goto Enomem;

        if (vma->vm_flags & VM_ACCOUNT) {
                unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
                if (security_vm_enough_memory(charged))
                        goto Efault;
                *p = charged;
        }

        return vma;

Efault: /* very odd choice for most of the cases, but... */
        return ERR_PTR(-EFAULT);
Einval:
        return ERR_PTR(-EINVAL);
Enomem:
        return ERR_PTR(-ENOMEM);
Eagain:
        return ERR_PTR(-EAGAIN);
}

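/*
 * Handle MREMAP_FIXED: validate the caller-supplied destination, make
 * sure it does not overlap the source, unmap whatever currently lives
 * there, shrink the source first if the mapping is getting smaller, then
 * reserve the destination with get_unmapped_area(MAP_FIXED) and hand off
 * to move_vma().
 */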
static unsigned long mremap_to(unsigned long addr,
        unsigned long old_len, unsigned long new_addr,
        unsigned long new_len)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        unsigned long map_flags;

        if (new_addr & ~PAGE_MASK)
                goto out;

        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                goto out;

        /* Check if the location we're moving into overlaps the
         * old location at all, and fail if it does.
         */
        if ((new_addr <= addr) && (new_addr+new_len) > addr)
                goto out;

        if ((addr <= new_addr) && (addr+old_len) > new_addr)
                goto out;

        ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
        if (ret)
                goto out;

        ret = do_munmap(mm, new_addr, new_len);
        if (ret)
                goto out;

        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
        }

        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        map_flags = MAP_FIXED;
        if (vma->vm_flags & VM_MAYSHARE)
                map_flags |= MAP_SHARED;

        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
        if (ret & ~PAGE_MASK)
                goto out1;

        ret = move_vma(vma, addr, old_len, new_len, new_addr);
        if (!(ret & ~PAGE_MASK))
                goto out;
out1:
        vm_unacct_memory(charged);

out:
        return ret;
}

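/*
 * Can @vma grow in place by @delta bytes?  True only if the new end does
 * not overflow, does not run into the next vma, and get_unmapped_area()
 * accepts the enlarged range at the same fixed address.
 */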
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
        unsigned long end = vma->vm_end + delta;
        if (end < vma->vm_end) /* overflow */
                return 0;
        if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
                return 0;
        if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                              0, MAP_FIXED) & ~PAGE_MASK)
                return 0;
        return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
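/*
 * Roughly, the userspace view of the paths handled below (illustrative
 * sketch, not taken from this file):
 *
 *      p = mremap(p, old_size, new_size, 0);
 *              shrink, or grow in place if there is room
 *      p = mremap(p, old_size, new_size, MREMAP_MAYMOVE);
 *              grow, relocating the mapping if it cannot expand in place
 *      p = mremap(p, old_size, new_size,
 *                 MREMAP_MAYMOVE | MREMAP_FIXED, target);
 *              move/resize to a caller-chosen address (via mremap_to())
 */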
unsigned long do_mremap(unsigned long addr,
        unsigned long old_len, unsigned long new_len,
        unsigned long flags, unsigned long new_addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;

        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                goto out;

        if (addr & ~PAGE_MASK)
                goto out;

        old_len = PAGE_ALIGN(old_len);
        new_len = PAGE_ALIGN(new_len);

        /*
         * We allow a zero old-len as a special case
         * for DOS-emu "duplicate shm area" thing. But
         * a zero new-len is nonsensical.
         */
        if (!new_len)
                goto out;

        if (flags & MREMAP_FIXED) {
                if (flags & MREMAP_MAYMOVE)
                        ret = mremap_to(addr, old_len, new_addr, new_len);
                goto out;
        }

        /*
         * Always allow a shrinking remap: that just unmaps
         * the unnecessary pages..
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
                goto out;
        }

        /*
         * Ok, we need to grow..
         */
        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* old_len exactly to the end of the area..
         */
        if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
                if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;

                        if (vma_adjust(vma, vma->vm_start, addr + new_len,
                                       vma->vm_pgoff, NULL)) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        mm->total_vm += pages;
                        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                mlock_vma_pages_range(vma, addr + old_len,
                                                   addr + new_len);
                        }
                        ret = addr;
                        goto out;
                }
        }

        /*
         * We weren't able to just expand or shrink the area,
         * we need to create a new one and move it..
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
                unsigned long map_flags = 0;
                if (vma->vm_flags & VM_MAYSHARE)
                        map_flags |= MAP_SHARED;

                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                                        vma->vm_pgoff +
                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
                                        map_flags);
                if (new_addr & ~PAGE_MASK) {
                        ret = new_addr;
                        goto out;
                }

                ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
                if (ret)
                        goto out;
                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
        return ret;
}

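/*
 * mremap(2) entry point: all the real work in do_mremap() runs under the
 * exclusive (write) mmap_sem of the calling process.
 */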
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
{
        unsigned long ret;

        down_write(&current->mm->mmap_sem);
        ret = do_mremap(addr, old_len, new_len, flags, new_addr);
        up_write(&current->mm->mmap_sem);
        return ret;
}