linux/mm/madvise.c
/*
 *      linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

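/*
 * Illustrative only, not part of this file: a minimal userspace sketch of
 * the splitting described above. Advising only the middle page of a
 * three-page anonymous mapping with MADV_DONTDUMP changes vm_flags for
 * that page alone, so the kernel has to split the VMA; the split shows up
 * as three adjacent entries in /proc/self/maps. The program below is a
 * hypothetical demo, kept out of the build with #if 0.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        /* Flag only the middle page; the VMA must be split around it. */
        if (madvise(buf + page, page, MADV_DONTDUMP)) {
                perror("madvise");
                return 1;
        }

        printf("mapping: %p-%p\n", (void *)buf, (void *)(buf + 3 * page));
        system("cat /proc/self/maps");  /* the range now spans three VMAs */
        return 0;
}
#endif
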
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
        unsigned long end, struct mm_walk *walk)
{
        pte_t *orig_pte;
        struct vm_area_struct *vma = walk->private;
        unsigned long index;

        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                return 0;

        for (index = start; index != end; index += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct page *page;
                spinlock_t *ptl;

                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);

                if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                                                vma, index);
                if (page)
                        put_page(page);
        }

        return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        struct mm_walk walk = {
                .mm = vma->vm_mm,
                .pmd_entry = swapin_walk_pmd_entry,
                .private = vma,
        };

        walk_page_range(start, end, &walk);

        lru_add_drain();        /* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        pgoff_t index;
        struct page *page;
        swp_entry_t swap;

        for (; start < end; start += PAGE_SIZE) {
                index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

                page = find_get_entry(mapping, index);
                if (!radix_tree_exceptional_entry(page)) {
                        if (page)
                                put_page(page);
                        continue;
                }
                swap = radix_to_swp_entry(page);
                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
                                                                NULL, 0);
                if (page)
                        put_page(page);
        }

        lru_add_drain();        /* Push any new pages onto the LRU now */
}
#endif          /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

#ifdef CONFIG_SWAP
        if (!file) {
                *prev = vma;
                force_swapin_readahead(vma, start, end);
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                *prev = vma;
                force_shm_swapin_readahead(vma, start, end,
                                        file->f_mapping);
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

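/*
 * Illustrative only, not part of this file: a userspace sketch of
 * MADV_WILLNEED on a file-backed mapping (the file path below is just an
 * example; any large regular file works). The madvise() call only
 * schedules read-ahead and returns immediately, so the later page touches
 * find much of the data already in the page cache. Kept out of the build
 * with #if 0.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/var/log/syslog", O_RDONLY);     /* example file */
        struct stat st;
        char *map;
        volatile char sum = 0;
        off_t i;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                return 1;
        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        /* Kick off asynchronous read-ahead for the whole mapping. */
        if (madvise(map, st.st_size, MADV_WILLNEED))
                perror("madvise");

        for (i = 0; i < st.st_size; i += 4096)  /* touch every page */
                sum += map[i];

        munmap(map, st.st_size);
        close(fd);
        return 0;
}
#endif
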
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte, *pte, ptent;
        struct page *page;
        int nr_swap = 0;
        unsigned long next;

        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        goto next;

        if (pmd_trans_unstable(pmd))
                return 0;

        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte holds a swap entry, just clear the page table
                 * entry to prevent a swap-in, which is more expensive than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (non_swap_entry(entry))
                                continue;
                        nr_swap--;
                        free_swap_and_cache(entry);
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        continue;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                /*
                 * If the pmd isn't transhuge but the page is a THP owned
                 * only by this process, split it and deactivate all of
                 * its pages.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                goto out;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                goto out;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                goto out;
                        }
                        put_page(page);
                        unlock_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (PageSwapCache(page) || PageDirty(page)) {
                        if (!trylock_page(page))
                                continue;
                        /*
                         * If the page is shared with others, we can't
                         * clear its PG_dirty bit.
                         */
                        if (page_mapcount(page) != 1) {
                                unlock_page(page);
                                continue;
                        }

                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        /*
                         * Some architectures (e.g. PPC) don't update the TLB
                         * with set_pte_at and tlb_remove_tlb_entry, so for
                         * portability remap the pte as old|clean after
                         * clearing it.
                         */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);

                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        if (PageActive(page))
                                deactivate_page(page);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
        }
out:
        if (nr_swap) {
                if (current->mm == mm)
                        sync_mm_rss(mm);

                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
next:
        return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct mm_walk free_walk = {
                .pmd_entry = madvise_free_pte_range,
                .mm = vma->vm_mm,
                .private = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(addr, end, &free_walk);
        tlb_end_vma(tlb, vma);
}

static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        unsigned long start, end;
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        /* MADV_FREE only works for anonymous vmas at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        start = max(vma->vm_start, start_addr);
        if (start >= vma->vm_end)
                return -EINVAL;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(mm, start, end);
        madvise_free_page_range(&tlb, vma, start, end);
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);

        return 0;
}

static long madvise_free(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        return madvise_free_single_vma(vma, start, end);
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushing it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        zap_page_range(vma, start, end - start, NULL);
        return 0;
}

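/*
 * Illustrative only, not part of this file: a userspace sketch of the
 * MADV_DONTNEED semantics described above. For a private anonymous
 * mapping the pages are simply thrown away; the next access faults in a
 * fresh zero-filled page, so the assertion below holds even though the
 * page had been dirtied. Kept out of the build with #if 0.
 */
#if 0
#include <assert.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        buf[0] = 'x';                           /* dirty the page */
        if (madvise(buf, page, MADV_DONTNEED))  /* discard it */
                return 1;
        assert(buf[0] == 0);                    /* re-faulted as a zero page */
        return 0;
}
#endif
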
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                return -EINVAL;
        }

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_mutex.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_sem.
         */
        get_file(f);
        up_read(&current->mm->mmap_sem);
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        down_read(&current->mm->mmap_sem);
        return error;
}

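/*
 * Illustrative only, not part of this file: a userspace sketch of
 * MADV_REMOVE punching a hole in a shared mapping. It assumes
 * memfd_create() is available (kernel 3.17+, glibc 2.27+); any file on a
 * filesystem that supports hole punching (tmpfs, ext4, xfs, ...) would do
 * as well. Note the mapping must be MAP_SHARED and writable, matching
 * the VM_SHARED|VM_WRITE check above. Kept out of the build with #if 0.
 */
#if 0
#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int fd = memfd_create("madv_remove_demo", 0);
        char *map;

        if (fd < 0 || ftruncate(fd, 4 * page) < 0)
                return 1;
        map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        memset(map, 'x', 4 * page);

        /* Drop both the pages and the backing blocks of pages 1..2. */
        if (madvise(map + page, 2 * page, MADV_REMOVE))
                return 1;

        /* The hole reads back as zeroes through the mapping and the fd. */
        return map[page] == 0 ? 0 : 1;
}
#endif
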
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
        struct page *p;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        for (; start < end; start += PAGE_SIZE <<
                                compound_order(compound_head(p))) {
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &p);
                if (ret != 1)
                        return ret;

                if (PageHWPoison(p)) {
                        put_page(p);
                        continue;
                }
                if (bhv == MADV_SOFT_OFFLINE) {
                        pr_info("Soft offlining page %#lx at %#lx\n",
                                page_to_pfn(p), start);
                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }
                pr_info("Injecting memory failure for page %#lx at %#lx\n",
                       page_to_pfn(p), start);
                ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
                if (ret)
                        return ret;
        }
        return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_FREE:
                /*
                 * XXX: In this implementation, MADV_FREE works like
                 * MADV_DONTNEED on a swapless system or when swap is full.
                 */
                if (get_nr_swap_pages() > 0)
                        return madvise_free(vma, prev, start, end);
                /* passthrough */
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static bool
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
                return true;

        default:
                return false;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the
 *              application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *              where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *              were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *              huge pages in the future. Existing pages might be coalesced and
 *              new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *              transparent huge pages so the existing pages will not be
 *              coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *              from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or the application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_hwpoison(behavior, start, start+len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        if (start & ~PAGE_MASK)
                return error;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                return error;

        end = start + len;
        if (end < start)
                return error;

        error = 0;
        if (end == start)
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        blk_finish_plug(&plug);
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}
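
/*
 * Illustrative only, not part of this file: a userspace sketch of
 * MADV_FREE as documented above. The contents of the range remain valid
 * until the kernel actually reclaims the pages under memory pressure (or,
 * on a swapless system, falls back to MADV_DONTNEED behaviour), so a read
 * may still return the old data; only after reclaim do the pages read
 * back as zeroes. A store to a page cancels the lazy-free hint for it.
 * Kept out of the build with #if 0.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, 16 * page, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 'x', 16 * page);

        /* Mark the whole buffer as lazily freeable. */
        if (madvise(buf, 16 * page, MADV_FREE)) {
                perror("madvise(MADV_FREE)");   /* EINVAL before Linux 4.5 */
                return 1;
        }

        /* Either the old data or zero, depending on whether reclaim ran. */
        printf("first byte after MADV_FREE: %d\n", buf[0]);

        buf[0] = 'y';   /* a store keeps this page from being reclaimed */
        return 0;
}
#endif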