linux/mm/madvise.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *      linux/mm/madvise.c
   4 *
   5 * Copyright (C) 1999  Linus Torvalds
   6 * Copyright (C) 2002  Christoph Hellwig
   7 */
   8
   9#include <linux/mman.h>
  10#include <linux/pagemap.h>
  11#include <linux/syscalls.h>
  12#include <linux/mempolicy.h>
  13#include <linux/page-isolation.h>
  14#include <linux/page_idle.h>
  15#include <linux/userfaultfd_k.h>
  16#include <linux/hugetlb.h>
  17#include <linux/falloc.h>
  18#include <linux/fadvise.h>
  19#include <linux/sched.h>
  20#include <linux/ksm.h>
  21#include <linux/fs.h>
  22#include <linux/file.h>
  23#include <linux/blkdev.h>
  24#include <linux/backing-dev.h>
  25#include <linux/pagewalk.h>
  26#include <linux/swap.h>
  27#include <linux/swapops.h>
  28#include <linux/shmem_fs.h>
  29#include <linux/mmu_notifier.h>
  30
  31#include <asm/tlb.h>
  32
  33#include "internal.h"
  34
  35struct madvise_walk_private {
  36        struct mmu_gather *tlb;
  37        bool pageout;
  38};
  39
  40/*
  41 * Any behaviour which results in changes to the vma->vm_flags needs to
  42 * take mmap_sem for writing. Others, which simply traverse vmas, need
   43 * only take it for reading.
  44 */
  45static int madvise_need_mmap_write(int behavior)
  46{
  47        switch (behavior) {
  48        case MADV_REMOVE:
  49        case MADV_WILLNEED:
  50        case MADV_DONTNEED:
  51        case MADV_COLD:
  52        case MADV_PAGEOUT:
  53        case MADV_FREE:
  54                return 0;
  55        default:
  56                /* be safe, default to 1. list exceptions explicitly */
  57                return 1;
  58        }
  59}
  60
  61/*
  62 * We can potentially split a vm area into separate
  63 * areas, each area with its own behavior.
  64 */
  65static long madvise_behavior(struct vm_area_struct *vma,
  66                     struct vm_area_struct **prev,
  67                     unsigned long start, unsigned long end, int behavior)
  68{
  69        struct mm_struct *mm = vma->vm_mm;
  70        int error = 0;
  71        pgoff_t pgoff;
  72        unsigned long new_flags = vma->vm_flags;
  73
  74        switch (behavior) {
  75        case MADV_NORMAL:
  76                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  77                break;
  78        case MADV_SEQUENTIAL:
  79                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  80                break;
  81        case MADV_RANDOM:
  82                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  83                break;
  84        case MADV_DONTFORK:
  85                new_flags |= VM_DONTCOPY;
  86                break;
  87        case MADV_DOFORK:
  88                if (vma->vm_flags & VM_IO) {
  89                        error = -EINVAL;
  90                        goto out;
  91                }
  92                new_flags &= ~VM_DONTCOPY;
  93                break;
  94        case MADV_WIPEONFORK:
  95                /* MADV_WIPEONFORK is only supported on anonymous memory. */
  96                if (vma->vm_file || vma->vm_flags & VM_SHARED) {
  97                        error = -EINVAL;
  98                        goto out;
  99                }
 100                new_flags |= VM_WIPEONFORK;
 101                break;
 102        case MADV_KEEPONFORK:
 103                new_flags &= ~VM_WIPEONFORK;
 104                break;
 105        case MADV_DONTDUMP:
 106                new_flags |= VM_DONTDUMP;
 107                break;
 108        case MADV_DODUMP:
 109                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
 110                        error = -EINVAL;
 111                        goto out;
 112                }
 113                new_flags &= ~VM_DONTDUMP;
 114                break;
 115        case MADV_MERGEABLE:
 116        case MADV_UNMERGEABLE:
 117                error = ksm_madvise(vma, start, end, behavior, &new_flags);
 118                if (error)
 119                        goto out_convert_errno;
 120                break;
 121        case MADV_HUGEPAGE:
 122        case MADV_NOHUGEPAGE:
 123                error = hugepage_madvise(vma, &new_flags, behavior);
 124                if (error)
 125                        goto out_convert_errno;
 126                break;
 127        }
 128
 129        if (new_flags == vma->vm_flags) {
 130                *prev = vma;
 131                goto out;
 132        }
 133
 134        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 135        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
 136                          vma->vm_file, pgoff, vma_policy(vma),
 137                          vma->vm_userfaultfd_ctx);
 138        if (*prev) {
 139                vma = *prev;
 140                goto success;
 141        }
 142
 143        *prev = vma;
 144
 145        if (start != vma->vm_start) {
 146                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
 147                        error = -ENOMEM;
 148                        goto out;
 149                }
 150                error = __split_vma(mm, vma, start, 1);
 151                if (error)
 152                        goto out_convert_errno;
 153        }
 154
 155        if (end != vma->vm_end) {
 156                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
 157                        error = -ENOMEM;
 158                        goto out;
 159                }
 160                error = __split_vma(mm, vma, end, 0);
 161                if (error)
 162                        goto out_convert_errno;
 163        }
 164
 165success:
 166        /*
 167         * vm_flags is protected by the mmap_sem held in write mode.
 168         */
 169        vma->vm_flags = new_flags;
 170
 171out_convert_errno:
 172        /*
 173         * madvise() returns EAGAIN if kernel resources, such as
 174         * slab, are temporarily unavailable.
 175         */
 176        if (error == -ENOMEM)
 177                error = -EAGAIN;
 178out:
 179        return error;
 180}
 181
 182#ifdef CONFIG_SWAP
 183static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 184        unsigned long end, struct mm_walk *walk)
 185{
 186        pte_t *orig_pte;
 187        struct vm_area_struct *vma = walk->private;
 188        unsigned long index;
 189
 190        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 191                return 0;
 192
 193        for (index = start; index != end; index += PAGE_SIZE) {
 194                pte_t pte;
 195                swp_entry_t entry;
 196                struct page *page;
 197                spinlock_t *ptl;
 198
 199                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
 200                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 201                pte_unmap_unlock(orig_pte, ptl);
 202
 203                if (pte_present(pte) || pte_none(pte))
 204                        continue;
 205                entry = pte_to_swp_entry(pte);
 206                if (unlikely(non_swap_entry(entry)))
 207                        continue;
 208
 209                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 210                                                        vma, index, false);
 211                if (page)
 212                        put_page(page);
 213        }
 214
 215        return 0;
 216}
 217
 218static const struct mm_walk_ops swapin_walk_ops = {
 219        .pmd_entry              = swapin_walk_pmd_entry,
 220};
 221
 222static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 223                unsigned long start, unsigned long end,
 224                struct address_space *mapping)
 225{
 226        pgoff_t index;
 227        struct page *page;
 228        swp_entry_t swap;
 229
 230        for (; start < end; start += PAGE_SIZE) {
 231                index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 232
 233                page = find_get_entry(mapping, index);
 234                if (!xa_is_value(page)) {
 235                        if (page)
 236                                put_page(page);
 237                        continue;
 238                }
 239                swap = radix_to_swp_entry(page);
 240                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
 241                                                        NULL, 0, false);
 242                if (page)
 243                        put_page(page);
 244        }
 245
 246        lru_add_drain();        /* Push any new pages onto the LRU now */
 247}
 248#endif          /* CONFIG_SWAP */
 249
 250/*
 251 * Schedule all required I/O operations.  Do not wait for completion.
 252 */
 253static long madvise_willneed(struct vm_area_struct *vma,
 254                             struct vm_area_struct **prev,
 255                             unsigned long start, unsigned long end)
 256{
 257        struct file *file = vma->vm_file;
 258        loff_t offset;
 259
 260        *prev = vma;
 261#ifdef CONFIG_SWAP
 262        if (!file) {
 263                walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
 264                lru_add_drain(); /* Push any new pages onto the LRU now */
 265                return 0;
 266        }
 267
 268        if (shmem_mapping(file->f_mapping)) {
 269                force_shm_swapin_readahead(vma, start, end,
 270                                        file->f_mapping);
 271                return 0;
 272        }
 273#else
 274        if (!file)
 275                return -EBADF;
 276#endif
 277
 278        if (IS_DAX(file_inode(file))) {
 279                /* no bad return value, but ignore advice */
 280                return 0;
 281        }
 282
 283        /*
 284         * Filesystem's fadvise may need to take various locks.  We need to
 285         * explicitly grab a reference because the vma (and hence the
 286         * vma's reference to the file) can go away as soon as we drop
 287         * mmap_sem.
 288         */
 289        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 290        get_file(file);
 291        up_read(&current->mm->mmap_sem);
 292        offset = (loff_t)(start - vma->vm_start)
 293                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 294        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
 295        fput(file);
 296        down_read(&current->mm->mmap_sem);
 297        return 0;
 298}
 299
 300static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 301                                unsigned long addr, unsigned long end,
 302                                struct mm_walk *walk)
 303{
 304        struct madvise_walk_private *private = walk->private;
 305        struct mmu_gather *tlb = private->tlb;
 306        bool pageout = private->pageout;
 307        struct mm_struct *mm = tlb->mm;
 308        struct vm_area_struct *vma = walk->vma;
 309        pte_t *orig_pte, *pte, ptent;
 310        spinlock_t *ptl;
 311        struct page *page = NULL;
 312        LIST_HEAD(page_list);
 313
 314        if (fatal_signal_pending(current))
 315                return -EINTR;
 316
 317#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 318        if (pmd_trans_huge(*pmd)) {
 319                pmd_t orig_pmd;
 320                unsigned long next = pmd_addr_end(addr, end);
 321
 322                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 323                ptl = pmd_trans_huge_lock(pmd, vma);
 324                if (!ptl)
 325                        return 0;
 326
 327                orig_pmd = *pmd;
 328                if (is_huge_zero_pmd(orig_pmd))
 329                        goto huge_unlock;
 330
 331                if (unlikely(!pmd_present(orig_pmd))) {
 332                        VM_BUG_ON(thp_migration_supported() &&
 333                                        !is_pmd_migration_entry(orig_pmd));
 334                        goto huge_unlock;
 335                }
 336
 337                page = pmd_page(orig_pmd);
 338                if (next - addr != HPAGE_PMD_SIZE) {
 339                        int err;
 340
 341                        if (page_mapcount(page) != 1)
 342                                goto huge_unlock;
 343
 344                        get_page(page);
 345                        spin_unlock(ptl);
 346                        lock_page(page);
 347                        err = split_huge_page(page);
 348                        unlock_page(page);
 349                        put_page(page);
 350                        if (!err)
 351                                goto regular_page;
 352                        return 0;
 353                }
 354
 355                if (pmd_young(orig_pmd)) {
 356                        pmdp_invalidate(vma, addr, pmd);
 357                        orig_pmd = pmd_mkold(orig_pmd);
 358
 359                        set_pmd_at(mm, addr, pmd, orig_pmd);
 360                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 361                }
 362
 363                ClearPageReferenced(page);
 364                test_and_clear_page_young(page);
 365                if (pageout) {
 366                        if (!isolate_lru_page(page)) {
 367                                if (PageUnevictable(page))
 368                                        putback_lru_page(page);
 369                                else
 370                                        list_add(&page->lru, &page_list);
 371                        }
 372                } else
 373                        deactivate_page(page);
 374huge_unlock:
 375                spin_unlock(ptl);
 376                if (pageout)
 377                        reclaim_pages(&page_list);
 378                return 0;
 379        }
 380
 381        if (pmd_trans_unstable(pmd))
 382                return 0;
 383regular_page:
 384#endif
 385        tlb_change_page_size(tlb, PAGE_SIZE);
 386        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 387        flush_tlb_batched_pending(mm);
 388        arch_enter_lazy_mmu_mode();
 389        for (; addr < end; pte++, addr += PAGE_SIZE) {
 390                ptent = *pte;
 391
 392                if (pte_none(ptent))
 393                        continue;
 394
 395                if (!pte_present(ptent))
 396                        continue;
 397
 398                page = vm_normal_page(vma, addr, ptent);
 399                if (!page)
 400                        continue;
 401
 402                /*
  403                 * Creating a THP page is expensive, so split it only if we
  404                 * are sure it's worth it. Split it if we are the only owner.
 405                 */
 406                if (PageTransCompound(page)) {
 407                        if (page_mapcount(page) != 1)
 408                                break;
 409                        get_page(page);
 410                        if (!trylock_page(page)) {
 411                                put_page(page);
 412                                break;
 413                        }
 414                        pte_unmap_unlock(orig_pte, ptl);
 415                        if (split_huge_page(page)) {
 416                                unlock_page(page);
 417                                put_page(page);
 418                                pte_offset_map_lock(mm, pmd, addr, &ptl);
 419                                break;
 420                        }
 421                        unlock_page(page);
 422                        put_page(page);
 423                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 424                        pte--;
 425                        addr -= PAGE_SIZE;
 426                        continue;
 427                }
 428
 429                VM_BUG_ON_PAGE(PageTransCompound(page), page);
 430
 431                if (pte_young(ptent)) {
 432                        ptent = ptep_get_and_clear_full(mm, addr, pte,
 433                                                        tlb->fullmm);
 434                        ptent = pte_mkold(ptent);
 435                        set_pte_at(mm, addr, pte, ptent);
 436                        tlb_remove_tlb_entry(tlb, pte, addr);
 437                }
 438
 439                /*
  440                 * We are deactivating a page to accelerate reclaiming it.
  441                 * The VM cannot reclaim the page unless we clear PG_young.
  442                 * As a side effect, this confuses idle-page tracking,
  443                 * which will miss the recent reference history.
 444                 */
 445                ClearPageReferenced(page);
 446                test_and_clear_page_young(page);
 447                if (pageout) {
 448                        if (!isolate_lru_page(page)) {
 449                                if (PageUnevictable(page))
 450                                        putback_lru_page(page);
 451                                else
 452                                        list_add(&page->lru, &page_list);
 453                        }
 454                } else
 455                        deactivate_page(page);
 456        }
 457
 458        arch_leave_lazy_mmu_mode();
 459        pte_unmap_unlock(orig_pte, ptl);
 460        if (pageout)
 461                reclaim_pages(&page_list);
 462        cond_resched();
 463
 464        return 0;
 465}
 466
 467static const struct mm_walk_ops cold_walk_ops = {
 468        .pmd_entry = madvise_cold_or_pageout_pte_range,
 469};
 470
 471static void madvise_cold_page_range(struct mmu_gather *tlb,
 472                             struct vm_area_struct *vma,
 473                             unsigned long addr, unsigned long end)
 474{
 475        struct madvise_walk_private walk_private = {
 476                .pageout = false,
 477                .tlb = tlb,
 478        };
 479
 480        tlb_start_vma(tlb, vma);
 481        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 482        tlb_end_vma(tlb, vma);
 483}
 484
 485static long madvise_cold(struct vm_area_struct *vma,
 486                        struct vm_area_struct **prev,
 487                        unsigned long start_addr, unsigned long end_addr)
 488{
 489        struct mm_struct *mm = vma->vm_mm;
 490        struct mmu_gather tlb;
 491
 492        *prev = vma;
 493        if (!can_madv_lru_vma(vma))
 494                return -EINVAL;
 495
 496        lru_add_drain();
 497        tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 498        madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
 499        tlb_finish_mmu(&tlb, start_addr, end_addr);
 500
 501        return 0;
 502}
 503
 504static void madvise_pageout_page_range(struct mmu_gather *tlb,
 505                             struct vm_area_struct *vma,
 506                             unsigned long addr, unsigned long end)
 507{
 508        struct madvise_walk_private walk_private = {
 509                .pageout = true,
 510                .tlb = tlb,
 511        };
 512
 513        tlb_start_vma(tlb, vma);
 514        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 515        tlb_end_vma(tlb, vma);
 516}
 517
 518static inline bool can_do_pageout(struct vm_area_struct *vma)
 519{
 520        if (vma_is_anonymous(vma))
 521                return true;
 522        if (!vma->vm_file)
 523                return false;
 524        /*
  525         * Page out pagecache only for non-anonymous mappings that correspond
  526         * to files the calling process could open for writing if it tried;
  527         * otherwise we would be including shared non-exclusive mappings, which
  528         * opens a side channel.
 529         */
 530        return inode_owner_or_capable(file_inode(vma->vm_file)) ||
 531                inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
 532}
 533
 534static long madvise_pageout(struct vm_area_struct *vma,
 535                        struct vm_area_struct **prev,
 536                        unsigned long start_addr, unsigned long end_addr)
 537{
 538        struct mm_struct *mm = vma->vm_mm;
 539        struct mmu_gather tlb;
 540
 541        *prev = vma;
 542        if (!can_madv_lru_vma(vma))
 543                return -EINVAL;
 544
 545        if (!can_do_pageout(vma))
 546                return 0;
 547
 548        lru_add_drain();
 549        tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 550        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
 551        tlb_finish_mmu(&tlb, start_addr, end_addr);
 552
 553        return 0;
 554}
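/*
 * Illustrative userspace sketch (editor's addition, not part of this file):
 * how a caller might use the two advice values handled above.  Assumes a
 * libc exposing MADV_COLD and MADV_PAGEOUT (Linux 5.4+); buf, len and
 * reclaim_now are hypothetical names.
 *
 *	#include <sys/mman.h>
 *
 *	static void hint_cold(void *buf, size_t len, int reclaim_now)
 *	{
 *		// MADV_COLD only deactivates the pages; MADV_PAGEOUT also
 *		// asks for immediate reclaim, subject to can_do_pageout().
 *		(void)madvise(buf, len, reclaim_now ? MADV_PAGEOUT : MADV_COLD);
 *	}
 */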
 555
 556static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 557                                unsigned long end, struct mm_walk *walk)
 558
 559{
 560        struct mmu_gather *tlb = walk->private;
 561        struct mm_struct *mm = tlb->mm;
 562        struct vm_area_struct *vma = walk->vma;
 563        spinlock_t *ptl;
 564        pte_t *orig_pte, *pte, ptent;
 565        struct page *page;
 566        int nr_swap = 0;
 567        unsigned long next;
 568
 569        next = pmd_addr_end(addr, end);
 570        if (pmd_trans_huge(*pmd))
 571                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 572                        goto next;
 573
 574        if (pmd_trans_unstable(pmd))
 575                return 0;
 576
 577        tlb_change_page_size(tlb, PAGE_SIZE);
 578        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 579        flush_tlb_batched_pending(mm);
 580        arch_enter_lazy_mmu_mode();
 581        for (; addr != end; pte++, addr += PAGE_SIZE) {
 582                ptent = *pte;
 583
 584                if (pte_none(ptent))
 585                        continue;
 586                /*
  587                 * If the pte holds a swap entry, just clear the page table
  588                 * entry to prevent swap-in, which is more expensive than
  589                 * (page allocation + zeroing).
 590                 */
 591                if (!pte_present(ptent)) {
 592                        swp_entry_t entry;
 593
 594                        entry = pte_to_swp_entry(ptent);
 595                        if (non_swap_entry(entry))
 596                                continue;
 597                        nr_swap--;
 598                        free_swap_and_cache(entry);
 599                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 600                        continue;
 601                }
 602
 603                page = vm_normal_page(vma, addr, ptent);
 604                if (!page)
 605                        continue;
 606
 607                /*
 608                 * If pmd isn't transhuge but the page is THP and
 609                 * is owned by only this process, split it and
 610                 * deactivate all pages.
 611                 */
 612                if (PageTransCompound(page)) {
 613                        if (page_mapcount(page) != 1)
 614                                goto out;
 615                        get_page(page);
 616                        if (!trylock_page(page)) {
 617                                put_page(page);
 618                                goto out;
 619                        }
 620                        pte_unmap_unlock(orig_pte, ptl);
 621                        if (split_huge_page(page)) {
 622                                unlock_page(page);
 623                                put_page(page);
 624                                pte_offset_map_lock(mm, pmd, addr, &ptl);
 625                                goto out;
 626                        }
 627                        unlock_page(page);
 628                        put_page(page);
 629                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 630                        pte--;
 631                        addr -= PAGE_SIZE;
 632                        continue;
 633                }
 634
 635                VM_BUG_ON_PAGE(PageTransCompound(page), page);
 636
 637                if (PageSwapCache(page) || PageDirty(page)) {
 638                        if (!trylock_page(page))
 639                                continue;
 640                        /*
  641                         * If the page is shared with others, we cannot clear
  642                         * its PG_dirty bit.
 643                         */
 644                        if (page_mapcount(page) != 1) {
 645                                unlock_page(page);
 646                                continue;
 647                        }
 648
 649                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
 650                                unlock_page(page);
 651                                continue;
 652                        }
 653
 654                        ClearPageDirty(page);
 655                        unlock_page(page);
 656                }
 657
 658                if (pte_young(ptent) || pte_dirty(ptent)) {
 659                        /*
  660                         * Some architectures (e.g. PPC) don't update the TLB
  661                         * with set_pte_at and tlb_remove_tlb_entry, so for
  662                         * portability, remap the pte as old|clean
  663                         * after clearing it.
 664                         */
 665                        ptent = ptep_get_and_clear_full(mm, addr, pte,
 666                                                        tlb->fullmm);
 667
 668                        ptent = pte_mkold(ptent);
 669                        ptent = pte_mkclean(ptent);
 670                        set_pte_at(mm, addr, pte, ptent);
 671                        tlb_remove_tlb_entry(tlb, pte, addr);
 672                }
 673                mark_page_lazyfree(page);
 674        }
 675out:
 676        if (nr_swap) {
 677                if (current->mm == mm)
 678                        sync_mm_rss(mm);
 679
 680                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 681        }
 682        arch_leave_lazy_mmu_mode();
 683        pte_unmap_unlock(orig_pte, ptl);
 684        cond_resched();
 685next:
 686        return 0;
 687}
 688
 689static const struct mm_walk_ops madvise_free_walk_ops = {
 690        .pmd_entry              = madvise_free_pte_range,
 691};
 692
 693static int madvise_free_single_vma(struct vm_area_struct *vma,
 694                        unsigned long start_addr, unsigned long end_addr)
 695{
 696        struct mm_struct *mm = vma->vm_mm;
 697        struct mmu_notifier_range range;
 698        struct mmu_gather tlb;
 699
 700        /* MADV_FREE works for only anon vma at the moment */
 701        if (!vma_is_anonymous(vma))
 702                return -EINVAL;
 703
 704        range.start = max(vma->vm_start, start_addr);
 705        if (range.start >= vma->vm_end)
 706                return -EINVAL;
 707        range.end = min(vma->vm_end, end_addr);
 708        if (range.end <= vma->vm_start)
 709                return -EINVAL;
 710        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 711                                range.start, range.end);
 712
 713        lru_add_drain();
 714        tlb_gather_mmu(&tlb, mm, range.start, range.end);
 715        update_hiwater_rss(mm);
 716
 717        mmu_notifier_invalidate_range_start(&range);
 718        tlb_start_vma(&tlb, vma);
 719        walk_page_range(vma->vm_mm, range.start, range.end,
 720                        &madvise_free_walk_ops, &tlb);
 721        tlb_end_vma(&tlb, vma);
 722        mmu_notifier_invalidate_range_end(&range);
 723        tlb_finish_mmu(&tlb, range.start, range.end);
 724
 725        return 0;
 726}
 727
 728/*
 729 * Application no longer needs these pages.  If the pages are dirty,
 730 * it's OK to just throw them away.  The app will be more careful about
 731 * data it wants to keep.  Be sure to free swap resources too.  The
 732 * zap_page_range call sets things up for shrink_active_list to actually free
 733 * these pages later if no one else has touched them in the meantime,
 734 * although we could add these pages to a global reuse list for
 735 * shrink_active_list to pick up before reclaiming other pages.
 736 *
 737 * NB: This interface discards data rather than pushes it out to swap,
 738 * as some implementations do.  This has performance implications for
 739 * applications like large transactional databases which want to discard
 740 * pages in anonymous maps after committing to backing store the data
 741 * that was kept in them.  There is no reason to write this data out to
 742 * the swap area if the application is discarding it.
 743 *
 744 * An interface that causes the system to free clean pages and flush
 745 * dirty pages is already available as msync(MS_INVALIDATE).
 746 */
 747static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 748                                        unsigned long start, unsigned long end)
 749{
 750        zap_page_range(vma, start, end - start);
 751        return 0;
 752}
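/*
 * Illustrative userspace sketch (editor's addition, not part of this file):
 * the visible difference between MADV_DONTNEED and MADV_FREE on an
 * anonymous private mapping.  buf and len are hypothetical names.
 *
 *	#include <sys/mman.h>
 *
 *	// After MADV_DONTNEED the next touch of buf reads back zeroes,
 *	// because the range was zapped.  After MADV_FREE the old contents
 *	// may still be visible unless the kernel reclaimed the pages under
 *	// memory pressure in the meantime.
 *	madvise(buf, len, MADV_DONTNEED);
 *	madvise(buf, len, MADV_FREE);
 */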
 753
 754static long madvise_dontneed_free(struct vm_area_struct *vma,
 755                                  struct vm_area_struct **prev,
 756                                  unsigned long start, unsigned long end,
 757                                  int behavior)
 758{
 759        *prev = vma;
 760        if (!can_madv_lru_vma(vma))
 761                return -EINVAL;
 762
 763        if (!userfaultfd_remove(vma, start, end)) {
 764                *prev = NULL; /* mmap_sem has been dropped, prev is stale */
 765
 766                down_read(&current->mm->mmap_sem);
 767                vma = find_vma(current->mm, start);
 768                if (!vma)
 769                        return -ENOMEM;
 770                if (start < vma->vm_start) {
 771                        /*
 772                         * This "vma" under revalidation is the one
 773                         * with the lowest vma->vm_start where start
 774                         * is also < vma->vm_end. If start <
  775                         * vma->vm_start it means a hole materialized
 776                         * in the user address space within the
 777                         * virtual range passed to MADV_DONTNEED
 778                         * or MADV_FREE.
 779                         */
 780                        return -ENOMEM;
 781                }
 782                if (!can_madv_lru_vma(vma))
 783                        return -EINVAL;
 784                if (end > vma->vm_end) {
 785                        /*
 786                         * Don't fail if end > vma->vm_end. If the old
  787                         * vma was split while the mmap_sem was
  788                         * released, the concurrent operation does
  789                         * not leave madvise() with an
  790                         * undefined result. There may be an
 791                         * adjacent next vma that we'll walk
 792                         * next. userfaultfd_remove() will generate an
 793                         * UFFD_EVENT_REMOVE repetition on the
 794                         * end-vma->vm_end range, but the manager can
 795                         * handle a repetition fine.
 796                         */
 797                        end = vma->vm_end;
 798                }
 799                VM_WARN_ON(start >= end);
 800        }
 801
 802        if (behavior == MADV_DONTNEED)
 803                return madvise_dontneed_single_vma(vma, start, end);
 804        else if (behavior == MADV_FREE)
 805                return madvise_free_single_vma(vma, start, end);
 806        else
 807                return -EINVAL;
 808}
 809
 810/*
 811 * Application wants to free up the pages and associated backing store.
 812 * This is effectively punching a hole into the middle of a file.
 813 */
 814static long madvise_remove(struct vm_area_struct *vma,
 815                                struct vm_area_struct **prev,
 816                                unsigned long start, unsigned long end)
 817{
 818        loff_t offset;
 819        int error;
 820        struct file *f;
 821
 822        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 823
 824        if (vma->vm_flags & VM_LOCKED)
 825                return -EINVAL;
 826
 827        f = vma->vm_file;
 828
 829        if (!f || !f->f_mapping || !f->f_mapping->host) {
  830                return -EINVAL;
 831        }
 832
 833        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 834                return -EACCES;
 835
 836        offset = (loff_t)(start - vma->vm_start)
 837                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 838
 839        /*
 840         * Filesystem's fallocate may need to take i_mutex.  We need to
 841         * explicitly grab a reference because the vma (and hence the
 842         * vma's reference to the file) can go away as soon as we drop
 843         * mmap_sem.
 844         */
 845        get_file(f);
 846        if (userfaultfd_remove(vma, start, end)) {
 847                /* mmap_sem was not released by userfaultfd_remove() */
 848                up_read(&current->mm->mmap_sem);
 849        }
 850        error = vfs_fallocate(f,
 851                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 852                                offset, end - start);
 853        fput(f);
 854        down_read(&current->mm->mmap_sem);
 855        return error;
 856}
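/*
 * Illustrative userspace sketch (editor's addition, not part of this file):
 * MADV_REMOVE on a writable shared file mapping punches a hole in the
 * backing file, much like fallocate(FALLOC_FL_PUNCH_HOLE).  fd, off and
 * len are hypothetical; <sys/mman.h> and <stdio.h> are assumed included.
 *
 *	void *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, off);
 *	if (map != MAP_FAILED && madvise(map, len, MADV_REMOVE))
 *		perror("madvise(MADV_REMOVE)");
 */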
 857
 858#ifdef CONFIG_MEMORY_FAILURE
 859/*
 860 * Error injection support for memory error handling.
 861 */
 862static int madvise_inject_error(int behavior,
 863                unsigned long start, unsigned long end)
 864{
 865        struct page *page;
 866        struct zone *zone;
 867        unsigned int order;
 868
 869        if (!capable(CAP_SYS_ADMIN))
 870                return -EPERM;
 871
 872
 873        for (; start < end; start += PAGE_SIZE << order) {
 874                unsigned long pfn;
 875                int ret;
 876
 877                ret = get_user_pages_fast(start, 1, 0, &page);
 878                if (ret != 1)
 879                        return ret;
 880                pfn = page_to_pfn(page);
 881
 882                /*
 883                 * When soft offlining hugepages, after migrating the page
 884                 * we dissolve it, therefore in the second loop "page" will
 885                 * no longer be a compound page, and order will be 0.
 886                 */
 887                order = compound_order(compound_head(page));
 888
 889                if (PageHWPoison(page)) {
 890                        put_page(page);
 891                        continue;
 892                }
 893
 894                if (behavior == MADV_SOFT_OFFLINE) {
 895                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
 896                                        pfn, start);
 897
 898                        ret = soft_offline_page(page, MF_COUNT_INCREASED);
 899                        if (ret)
 900                                return ret;
 901                        continue;
 902                }
 903
 904                pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
 905                                pfn, start);
 906
 907                /*
 908                 * Drop the page reference taken by get_user_pages_fast(). In
 909                 * the absence of MF_COUNT_INCREASED the memory_failure()
 910                 * routine is responsible for pinning the page to prevent it
 911                 * from being released back to the page allocator.
 912                 */
 913                put_page(page);
 914                ret = memory_failure(pfn, 0);
 915                if (ret)
 916                        return ret;
 917        }
 918
 919        /* Ensure that all poisoned pages are removed from per-cpu lists */
 920        for_each_populated_zone(zone)
 921                drain_all_pages(zone);
 922
 923        return 0;
 924}
 925#endif
 926
 927static long
 928madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 929                unsigned long start, unsigned long end, int behavior)
 930{
 931        switch (behavior) {
 932        case MADV_REMOVE:
 933                return madvise_remove(vma, prev, start, end);
 934        case MADV_WILLNEED:
 935                return madvise_willneed(vma, prev, start, end);
 936        case MADV_COLD:
 937                return madvise_cold(vma, prev, start, end);
 938        case MADV_PAGEOUT:
 939                return madvise_pageout(vma, prev, start, end);
 940        case MADV_FREE:
 941        case MADV_DONTNEED:
 942                return madvise_dontneed_free(vma, prev, start, end, behavior);
 943        default:
 944                return madvise_behavior(vma, prev, start, end, behavior);
 945        }
 946}
 947
 948static bool
 949madvise_behavior_valid(int behavior)
 950{
 951        switch (behavior) {
 952        case MADV_DOFORK:
 953        case MADV_DONTFORK:
 954        case MADV_NORMAL:
 955        case MADV_SEQUENTIAL:
 956        case MADV_RANDOM:
 957        case MADV_REMOVE:
 958        case MADV_WILLNEED:
 959        case MADV_DONTNEED:
 960        case MADV_FREE:
 961        case MADV_COLD:
 962        case MADV_PAGEOUT:
 963#ifdef CONFIG_KSM
 964        case MADV_MERGEABLE:
 965        case MADV_UNMERGEABLE:
 966#endif
 967#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 968        case MADV_HUGEPAGE:
 969        case MADV_NOHUGEPAGE:
 970#endif
 971        case MADV_DONTDUMP:
 972        case MADV_DODUMP:
 973        case MADV_WIPEONFORK:
 974        case MADV_KEEPONFORK:
 975#ifdef CONFIG_MEMORY_FAILURE
 976        case MADV_SOFT_OFFLINE:
 977        case MADV_HWPOISON:
 978#endif
 979                return true;
 980
 981        default:
 982                return false;
 983        }
 984}
 985
 986/*
 987 * The madvise(2) system call.
 988 *
 989 * Applications can use madvise() to advise the kernel how it should
 990 * handle paging I/O in this VM area.  The idea is to help the kernel
 991 * use appropriate read-ahead and caching techniques.  The information
 992 * provided is advisory only, and can be safely disregarded by the
 993 * kernel without affecting the correct operation of the application.
 994 *
 995 * behavior values:
 996 *  MADV_NORMAL - the default behavior is to read clusters.  This
 997 *              results in some read-ahead and read-behind.
 998 *  MADV_RANDOM - the system should read the minimum amount of data
  999 *              on any access, since it is unlikely that the
 1000 *              application will need more than what it asks for.
1001 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1002 *              once, so they can be aggressively read ahead, and
1003 *              can be freed soon after they are accessed.
1004 *  MADV_WILLNEED - the application is notifying the system to read
1005 *              some pages ahead.
1006 *  MADV_DONTNEED - the application is finished with the given range,
1007 *              so the kernel can free resources associated with it.
1008 *  MADV_FREE - the application marks pages in the given range as lazy free,
1009 *              where actual purges are postponed until memory pressure happens.
1010 *  MADV_REMOVE - the application wants to free up the given range of
1011 *              pages and associated backing store.
1012 *  MADV_DONTFORK - omit this area from child's address space when forking:
1013 *              typically, to avoid COWing pages pinned by get_user_pages().
1014 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1015 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1016 *              range after a fork.
1017 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1018 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1019 *              were corrupted by unrecoverable hardware memory failure.
1020 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1021 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1022 *              this area with pages of identical content from other such areas.
 1023 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
1024 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1025 *              huge pages in the future. Existing pages might be coalesced and
1026 *              new pages might be allocated as THP.
1027 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1028 *              transparent huge pages so the existing pages will not be
1029 *              coalesced into THP and new pages will not be allocated as THP.
1030 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1031 *              from being included in its core dump.
1032 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1033 *
1034 * return values:
1035 *  zero    - success
1036 *  -EINVAL - start + len < 0, start is not page-aligned,
1037 *              "behavior" is not a valid value, or application
1038 *              is attempting to release locked or shared pages,
1039 *              or the specified address range includes file, Huge TLB,
 1040 *              MAP_SHARED or VM_PFNMAP range.
1041 *  -ENOMEM - addresses in the specified range are not currently
1042 *              mapped, or are outside the AS of the process.
1043 *  -EIO    - an I/O error occurred while paging in data.
1044 *  -EBADF  - map exists, but area maps something that isn't a file.
1045 *  -EAGAIN - a kernel resource was temporarily unavailable.
1046 */
1047SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1048{
1049        unsigned long end, tmp;
1050        struct vm_area_struct *vma, *prev;
1051        int unmapped_error = 0;
1052        int error = -EINVAL;
1053        int write;
1054        size_t len;
1055        struct blk_plug plug;
1056
1057        start = untagged_addr(start);
1058
1059        if (!madvise_behavior_valid(behavior))
1060                return error;
1061
1062        if (start & ~PAGE_MASK)
1063                return error;
1064        len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1065
 1066        /* Check to see whether len was rounded up from a small negative to zero */
1067        if (len_in && !len)
1068                return error;
1069
1070        end = start + len;
1071        if (end < start)
1072                return error;
1073
1074        error = 0;
1075        if (end == start)
1076                return error;
1077
1078#ifdef CONFIG_MEMORY_FAILURE
1079        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1080                return madvise_inject_error(behavior, start, start + len_in);
1081#endif
1082
1083        write = madvise_need_mmap_write(behavior);
1084        if (write) {
1085                if (down_write_killable(&current->mm->mmap_sem))
1086                        return -EINTR;
1087        } else {
1088                down_read(&current->mm->mmap_sem);
1089        }
1090
1091        /*
1092         * If the interval [start,end) covers some unmapped address
1093         * ranges, just ignore them, but return -ENOMEM at the end.
 1094         * This differs from how mlock etc. handle unmapped ranges.
1095         */
1096        vma = find_vma_prev(current->mm, start, &prev);
1097        if (vma && start > vma->vm_start)
1098                prev = vma;
1099
1100        blk_start_plug(&plug);
1101        for (;;) {
1102                /* Still start < end. */
1103                error = -ENOMEM;
1104                if (!vma)
1105                        goto out;
1106
1107                /* Here start < (end|vma->vm_end). */
1108                if (start < vma->vm_start) {
1109                        unmapped_error = -ENOMEM;
1110                        start = vma->vm_start;
1111                        if (start >= end)
1112                                goto out;
1113                }
1114
1115                /* Here vma->vm_start <= start < (end|vma->vm_end) */
1116                tmp = vma->vm_end;
1117                if (end < tmp)
1118                        tmp = end;
1119
1120                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1121                error = madvise_vma(vma, &prev, start, tmp, behavior);
1122                if (error)
1123                        goto out;
1124                start = tmp;
1125                if (prev && start < prev->vm_end)
1126                        start = prev->vm_end;
1127                error = unmapped_error;
1128                if (start >= end)
1129                        goto out;
1130                if (prev)
1131                        vma = prev->vm_next;
1132                else    /* madvise_remove dropped mmap_sem */
1133                        vma = find_vma(current->mm, start);
1134        }
1135out:
1136        blk_finish_plug(&plug);
1137        if (write)
1138                up_write(&current->mm->mmap_sem);
1139        else
1140                up_read(&current->mm->mmap_sem);
1141
1142        return error;
1143}
1144
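/*
 * Illustrative userspace sketch (editor's addition, not part of this file):
 * a minimal, self-contained caller of madvise(2) matching the behaviour
 * documented above.  The mapping size and the choice of MADV_WILLNEED are
 * arbitrary examples.
 *
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 16 * 4096;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		// Advice only: the kernel may ignore it without affecting
 *		// correctness, as the comment block above notes.
 *		if (madvise(buf, len, MADV_WILLNEED))
 *			perror("madvise");
 *		munmap(buf, len);
 *		return 0;
 *	}
 */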