/*
 *      linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                                vma->vm_file, pgoff, vma_policy(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}
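
/*
 * Illustrative sketch of the split logic above; the addresses are made-up
 * example values, not taken from the original source.  When the advised
 * range covers only the middle of a VMA:
 *
 *      VMA before:     [0x10000 ........................... 0x20000)
 *      madvise range:           [0x14000 ....... 0x18000)
 *
 * split_vma(mm, vma, start, 1) puts [0x10000, 0x14000) into a new VMA and
 * leaves "vma" covering [0x14000, 0x20000); split_vma(mm, vma, end, 0) then
 * trims it to [0x14000, 0x18000), which is the only piece that receives
 * new_flags.  vma_merge() is tried first so that, when a neighbouring VMA
 * already carries new_flags, the range is coalesced instead of split.
 */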

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
                             struct vm_area_struct ** prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

        if (!file)
                return -EBADF;

        if (file->f_mapping->a_ops->get_xip_mem) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}
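
/*
 * Worked example of the conversion above, with made-up values (assuming
 * 4 KiB pages, so PAGE_SHIFT = 12): for vm_start = 0x40000000, vm_pgoff = 16
 * and start = 0x40003000,
 *
 *      ((0x40003000 - 0x40000000) >> 12) + 16 = 3 + 16 = 19
 *
 * i.e. readahead begins at page 19 of the file, and "end - start" is the
 * number of file pages handed to force_page_cache_readahead().
 */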

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
                             struct vm_area_struct ** prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
                struct zap_details details = {
                        .nonlinear_vma = vma,
                        .last_index = ULONG_MAX,
                };
                zap_page_range(vma, start, end - start, &details);
        } else
                zap_page_range(vma, start, end - start, NULL);
        return 0;
}
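
/*
 * Illustrative userspace sketch of the pattern described above (sizes are
 * made-up example values).  The mapping stays valid; discarded anonymous
 * pages read back as zeroes on the next fault, file-backed pages are
 * re-read from the file.
 *
 *      size_t len = 16 * 4096;
 *      char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      memset(buf, 0xaa, len);                 // dirty the pages
 *      madvise(buf, len, MADV_DONTNEED);       // discard, do not swap out
 *      assert(buf[0] == 0);                    // next touch faults in a zero page
 */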

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        struct address_space *mapping;
        loff_t offset, endoff;
        int error;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                return -EINVAL;

        if (!vma->vm_file || !vma->vm_file->f_mapping
                || !vma->vm_file->f_mapping->host) {
                        return -EINVAL;
        }

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        mapping = vma->vm_file->f_mapping;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        endoff = (loff_t)(end - vma->vm_start - 1)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
        up_read(&current->mm->mmap_sem);
        error = vmtruncate_range(mapping->host, offset, endoff);
        down_read(&current->mm->mmap_sem);
        return error;
}
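
/*
 * Illustrative userspace sketch of punching a hole in a shared tmpfs
 * mapping (file name and sizes are made-up example values).  The mapping
 * must be MAP_SHARED and writable, otherwise -EACCES is returned above.
 *
 *      int fd = open("/dev/shm/example", O_RDWR | O_CREAT, 0600);
 *      ftruncate(fd, 8 * 4096);
 *      char *p = mmap(NULL, 8 * 4096, PROT_READ | PROT_WRITE,
 *                     MAP_SHARED, fd, 0);
 *      madvise(p + 2 * 4096, 4 * 4096, MADV_REMOVE);   // drop pages 2..5 and
 *                                                      // their backing store
 */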

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(unsigned long start, unsigned long end)
{
        int ret = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        for (; start < end; start += PAGE_SIZE) {
                struct page *p;
                int ret = get_user_pages(current, current->mm, start, 1,
                                                0, 0, &p, NULL);
                if (ret != 1)
                        return ret;
                printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
                __memory_failure(page_to_pfn(p), 0, 1);
                put_page(p);
        }
        return ret;
}
#endif
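
/*
 * Illustrative userspace sketch of exercising the error-injection path
 * above from a CAP_SYS_ADMIN process on a CONFIG_MEMORY_FAILURE kernel
 * (address and length are made-up example values).
 *
 *      char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      p[0] = 1;                               // fault in and dirty the page
 *      madvise(p, 4096, MADV_HWPOISON);        // inject a memory failure here
 */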

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static int
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
                return 1;

        default:
                return 0;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the appli-
 *              cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
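/*
 * Illustrative userspace sketch of a typical caller of this system call
 * (file name and sizes are made-up example values): stream through a large
 * read-only mapping once, then tell the kernel the pages can go.
 *
 *      int fd = open("data.bin", O_RDONLY);
 *      size_t len = 64 * 1024 * 1024;
 *      unsigned long sum = 0;
 *      char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 *      madvise(p, len, MADV_SEQUENTIAL);       // one linear pass coming:
 *                                              // read ahead aggressively
 *      for (size_t i = 0; i < len; i++)
 *              sum += (unsigned char)p[i];
 *      madvise(p, len, MADV_DONTNEED);         // finished with the range
 */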
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct * vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON)
                return madvise_hwpoison(start, start+len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

        if (start & ~PAGE_MASK)
                goto out;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                goto out;

        end = start + len;
        if (end < start)
                goto out;

        error = 0;
        if (end == start)
                goto out;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}