linux/mm/mremap.c
/*
 *      mm/mremap.c
 *
 *      (C) Copyright 1996 Linus Torvalds
 *
 *      Address space accounting code   <alan@lxorguk.ukuu.org.uk>
 *      (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

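/*
 * Walk the source page tables and return the pmd entry covering addr,
 * or NULL if any level is not present.  A transparent huge page mapped
 * here is split first, so the ptes below it can be moved individually
 * by move_ptes().
 */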
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        if (pgd_none_or_clear_bad(pgd))
                return NULL;

        pud = pud_offset(pgd, addr);
        if (pud_none_or_clear_bad(pud))
                return NULL;

        pmd = pmd_offset(pud, addr);
        split_huge_page_pmd(mm, pmd);
        if (pmd_none_or_clear_bad(pmd))
                return NULL;

        return pmd;
}

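/*
 * Allocate the destination page-table levels (pud, pmd and, if the pmd
 * is empty, the pte page) covering addr.  Returns the pmd, or NULL if
 * an allocation fails.
 */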
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                return NULL;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
                return NULL;

        return pmd;
}

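/*
 * Move the ptes of one chunk (at most a pmd's worth) from the old
 * mapping to the new one.  Both pte locks are held across the loop,
 * and each source pte is cleared and flushed before it is installed at
 * the new address.
 */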
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
                unsigned long new_addr)
{
        struct address_space *mapping = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
        unsigned long old_start;

        old_start = old_addr;
        mmu_notifier_invalidate_range_start(vma->vm_mm,
                                            old_start, old_end);
        if (vma->vm_file) {
                /*
                 * Subtle point from Rajesh Venkatasubramanian: before
                 * moving file-based ptes, we must lock truncate_pagecache
                 * out, since it might clean the dst vma before the src vma,
                 * and we propagate stale pages into the dst afterward.
                 */
                mapping = vma->vm_file->f_mapping;
                spin_lock(&mapping->i_mmap_lock);
                new_vma->vm_truncate_count = 0;
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * pte locks because exclusive mmap_sem prevents deadlock.
         */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        new_pte = pte_offset_map(new_pmd, new_addr);
        new_ptl = pte_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        arch_enter_lazy_mmu_mode();

        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;
                pte = ptep_clear_flush(vma, old_addr, old_pte);
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                set_pte_at(mm, new_addr, new_pte, pte);
        }

        arch_leave_lazy_mmu_mode();
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
                spin_unlock(&mapping->i_mmap_lock);
        mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}

#define LATENCY_LIMIT   (64 * PAGE_SIZE)

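/*
 * Walk the old range chunk by chunk and hand each piece to move_ptes().
 * A chunk never crosses a pmd boundary on either side and is capped at
 * LATENCY_LIMIT, with a cond_resched() between chunks.  Returns the
 * number of bytes moved, which is less than len only if a destination
 * page table could not be allocated.
 */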
unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len)
{
        unsigned long extent, next, old_end;
        pmd_t *old_pmd, *new_pmd;

        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);

        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
                next = (old_addr + PMD_SIZE) & PMD_MASK;
                if (next - 1 > old_end)
                        next = old_end;
                extent = next - old_addr;
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
                new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
                next = (new_addr + PMD_SIZE) & PMD_MASK;
                if (extent > next - new_addr)
                        extent = next - new_addr;
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent,
                                new_vma, new_pmd, new_addr);
        }

        return len + old_addr - old_end;        /* how much done */
}

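/*
 * Move the mapping: set up new_vma at new_addr, move the page tables
 * across, then unmap the old range.  If the move fails part way, the
 * entries moved so far are moved back and the new range is unmapped
 * instead, leaving the old mapping intact and returning -ENOMEM.
 */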
static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
                unsigned long new_len, unsigned long new_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
        unsigned long moved_len;
        unsigned long excess = 0;
        unsigned long hiwater_vm;
        int split = 0;
        int err;

        /*
         * We'd prefer to avoid failure later on in do_munmap:
         * which may split one vma into three before unmapping.
         */
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /*
         * Advise KSM to break any KSM pages in the area to be moved:
         * it would be confusing if they were to turn up at the new
         * location, where they happen to coincide with different KSM
         * pages recently unmapped.  But leave vma->vm_flags as it was,
         * so KSM can come around to merge on vma and new_vma afterwards.
         */
        err = ksm_madvise(vma, old_addr, old_addr + old_len,
                                                MADV_UNMERGEABLE, &vm_flags);
        if (err)
                return err;

        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
        if (!new_vma)
                return -ENOMEM;

        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
        if (moved_len < old_len) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
        }

        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
                vma->vm_flags &= ~VM_ACCOUNT;
                excess = vma->vm_end - vma->vm_start - old_len;
                if (old_addr > vma->vm_start &&
                    old_addr + old_len < vma->vm_end)
                        split = 1;
        }

        /*
         * If we failed to move page tables we still do total_vm increment
         * since do_munmap() will decrement it by old_len == new_len.
         *
         * Since total_vm is about to be raised artificially high for a
         * moment, we need to restore high watermark afterwards: if stats
         * are taken meanwhile, total_vm and hiwater_vm appear too high.
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
        mm->total_vm += new_len >> PAGE_SHIFT;
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
        }
        mm->hiwater_vm = hiwater_vm;

        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (excess) {
                vma->vm_flags |= VM_ACCOUNT;
                if (split)
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }

        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                if (new_len > old_len)
                        mlock_vma_pages_range(new_vma, new_addr + old_len,
                                                       new_addr + new_len);
        }

        return new_addr;
}

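/*
 * Find and validate the vma to be resized: it must contain addr, cover
 * the whole old range and not be hugetlb, and any growth must pass the
 * VM_DONTEXPAND/VM_PFNMAP, mlock-limit and commit-accounting checks.
 * On success *p is the number of pages charged for VM_ACCOUNT, which
 * the caller must unaccount if it fails later.
 */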
static struct vm_area_struct *vma_to_resize(unsigned long addr,
        unsigned long old_len, unsigned long new_len, unsigned long *p)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);

        if (!vma || vma->vm_start > addr)
                goto Efault;

        if (is_vm_hugetlb_page(vma))
                goto Einval;

        /* We can't remap across vm area boundaries */
        if (old_len > vma->vm_end - addr)
                goto Efault;

        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
                if (new_len > old_len)
                        goto Efault;
        }

        if (vma->vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
                locked = mm->locked_vm << PAGE_SHIFT;
                lock_limit = rlimit(RLIMIT_MEMLOCK);
                locked += new_len - old_len;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        goto Eagain;
        }

        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
                goto Enomem;

        if (vma->vm_flags & VM_ACCOUNT) {
                unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
                if (security_vm_enough_memory(charged))
                        goto Efault;
                *p = charged;
        }

        return vma;

Efault: /* very odd choice for most of the cases, but... */
        return ERR_PTR(-EFAULT);
Einval:
        return ERR_PTR(-EINVAL);
Enomem:
        return ERR_PTR(-ENOMEM);
Eagain:
        return ERR_PTR(-EAGAIN);
}

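/*
 * MREMAP_FIXED: move the mapping to the caller-supplied new_addr.  The
 * new range must be page aligned, fit below TASK_SIZE and not overlap
 * the old range; it is unmapped first, and the move itself is done by
 * move_vma() once get_unmapped_area() has accepted the fixed address.
 */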
static unsigned long mremap_to(unsigned long addr,
        unsigned long old_len, unsigned long new_addr,
        unsigned long new_len)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        unsigned long map_flags;

        if (new_addr & ~PAGE_MASK)
                goto out;

        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                goto out;

        /* Check if the location we're moving into overlaps the
         * old location at all, and fail if it does.
         */
        if ((new_addr <= addr) && (new_addr+new_len) > addr)
                goto out;

        if ((addr <= new_addr) && (addr+old_len) > new_addr)
                goto out;

        ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
        if (ret)
                goto out;

        ret = do_munmap(mm, new_addr, new_len);
        if (ret)
                goto out;

        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
        }

        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        map_flags = MAP_FIXED;
        if (vma->vm_flags & VM_MAYSHARE)
                map_flags |= MAP_SHARED;

        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
        if (ret & ~PAGE_MASK)
                goto out1;

        ret = move_vma(vma, addr, old_len, new_len, new_addr);
        if (!(ret & ~PAGE_MASK))
                goto out;
out1:
        vm_unacct_memory(charged);

out:
        return ret;
}

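/*
 * Can this vma grow in place by delta bytes?  Only if the new end does
 * not overflow, does not run into the next vma, and get_unmapped_area()
 * accepts the enlarged range.
 */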
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
        unsigned long end = vma->vm_end + delta;
        if (end < vma->vm_end) /* overflow */
                return 0;
        if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
                return 0;
        if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                              0, MAP_FIXED) & ~PAGE_MASK)
                return 0;
        return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
unsigned long do_mremap(unsigned long addr,
        unsigned long old_len, unsigned long new_len,
        unsigned long flags, unsigned long new_addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;

        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                goto out;

        if (addr & ~PAGE_MASK)
                goto out;

        old_len = PAGE_ALIGN(old_len);
        new_len = PAGE_ALIGN(new_len);

        /*
         * We allow a zero old-len as a special case
         * for DOS-emu "duplicate shm area" thing. But
         * a zero new-len is nonsensical.
         */
        if (!new_len)
                goto out;

        if (flags & MREMAP_FIXED) {
                if (flags & MREMAP_MAYMOVE)
                        ret = mremap_to(addr, old_len, new_addr, new_len);
                goto out;
        }

        /*
         * Always allow a shrinking remap: that just unmaps
         * the unnecessary pages..
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
                ret = do_munmap(mm, addr+new_len, old_len - new_len);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
                goto out;
        }

        /*
         * Ok, we need to grow..
         */
        vma = vma_to_resize(addr, old_len, new_len, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* old_len exactly to the end of the area..
         */
        if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
                if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;

                        if (vma_adjust(vma, vma->vm_start, addr + new_len,
                                       vma->vm_pgoff, NULL)) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        mm->total_vm += pages;
                        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                mlock_vma_pages_range(vma, addr + old_len,
                                                   addr + new_len);
                        }
                        ret = addr;
                        goto out;
                }
        }

        /*
         * We weren't able to just expand or shrink the area,
         * we need to create a new one and move it..
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
                unsigned long map_flags = 0;
                if (vma->vm_flags & VM_MAYSHARE)
                        map_flags |= MAP_SHARED;

                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                                        vma->vm_pgoff +
                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
                                        map_flags);
                if (new_addr & ~PAGE_MASK) {
                        ret = new_addr;
                        goto out;
                }

                ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
                if (ret)
                        goto out;
                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
        return ret;
}

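/*
 * mremap(2) entry point: all of the work is done by do_mremap() under
 * the exclusive (write) mmap_sem, which is what makes the pte-lock
 * ordering in move_ptes() safe.
 */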
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
{
        unsigned long ret;

        down_write(&current->mm->mmap_sem);
        ret = do_mremap(addr, old_len, new_len, flags, new_addr);
        up_write(&current->mm->mmap_sem);
        return ret;
}

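For reference, a minimal userspace sketch (not part of mm/mremap.c) exercising the syscall implemented above: it grows an anonymous mapping with MREMAP_MAYMOVE and uses whichever address the kernel hands back. Sizes and names are illustrative only.

/* Illustrative userspace caller of mremap(2); not part of the kernel file above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        size_t old_len = page, new_len = 2 * page;

        /* One page, anonymous and writable. */
        char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        strcpy(p, "hello");

        /*
         * Grow to two pages.  With MREMAP_MAYMOVE the kernel is free to
         * relocate the mapping, so only the returned address is valid.
         */
        char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
        if (q == MAP_FAILED)
                return 1;

        printf("%s (moved: %s)\n", q, q == p ? "no" : "yes");
        munmap(q, new_len);
        return 0;
}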