// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
        struct mmu_gather *tlb;
        bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out_convert_errno;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out_convert_errno;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
                          vma->vm_userfaultfd_ctx);
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, start, 1);
                if (error)
                        goto out_convert_errno;
        }

        if (end != vma->vm_end) {
                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
                        error = -ENOMEM;
                        goto out;
                }
                error = __split_vma(mm, vma, end, 0);
                if (error)
                        goto out_convert_errno;
        }

success:
        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         */
        vma->vm_flags = new_flags;

out_convert_errno:
        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
out:
        return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
        unsigned long end, struct mm_walk *walk)
{
        pte_t *orig_pte;
        struct vm_area_struct *vma = walk->private;
        unsigned long index;

        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                return 0;

        for (index = start; index != end; index += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct page *page;
                spinlock_t *ptl;

                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);

                if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                                        vma, index, false);
                if (page)
                        put_page(page);
        }

        return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry              = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
        pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
        struct page *page;

        rcu_read_lock();
        xas_for_each(&xas, page, end_index) {
                swp_entry_t swap;

                if (!xa_is_value(page))
                        continue;
                xas_pause(&xas);
                rcu_read_unlock();

                swap = radix_to_swp_entry(page);
                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
                                                        NULL, 0, false);
                if (page)
                        put_page(page);

                rcu_read_lock();
        }
        rcu_read_unlock();

        lru_add_drain();        /* Push any new pages onto the LRU now */
}
#endif          /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
        loff_t offset;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!file) {
                walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
                lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                force_shm_swapin_readahead(vma, start, end,
                                        file->f_mapping);
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        /*
         * Filesystem's fadvise may need to take various locks.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
        get_file(file);
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        mmap_read_unlock(mm);
        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
        fput(file);
        mmap_read_lock(mm);
        return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        struct madvise_walk_private *private = walk->private;
        struct mmu_gather *tlb = private->tlb;
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        pte_t *orig_pte, *pte, ptent;
        spinlock_t *ptl;
        struct page *page = NULL;
        LIST_HEAD(page_list);

        if (fatal_signal_pending(current))
                return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
                unsigned long next = pmd_addr_end(addr, end);

                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
                ptl = pmd_trans_huge_lock(pmd, vma);
                if (!ptl)
                        return 0;

                orig_pmd = *pmd;
                if (is_huge_zero_pmd(orig_pmd))
                        goto huge_unlock;

                if (unlikely(!pmd_present(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                        !is_pmd_migration_entry(orig_pmd));
                        goto huge_unlock;
                }

                page = pmd_page(orig_pmd);

                /* Do not interfere with other mappings of this page */
                if (page_mapcount(page) != 1)
                        goto huge_unlock;

                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;

                        get_page(page);
                        spin_unlock(ptl);
                        lock_page(page);
                        err = split_huge_page(page);
                        unlock_page(page);
                        put_page(page);
                        if (!err)
                                goto regular_page;
                        return 0;
                }

                if (pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);

                        set_pmd_at(mm, addr, pmd, orig_pmd);
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }

                ClearPageReferenced(page);
                test_and_clear_page_young(page);
                if (pageout) {
                        if (!isolate_lru_page(page)) {
                                if (PageUnevictable(page))
                                        putback_lru_page(page);
                                else
                                        list_add(&page->lru, &page_list);
                        }
                } else
                        deactivate_page(page);
huge_unlock:
                spin_unlock(ptl);
                if (pageout)
                        reclaim_pages(&page_list);
                return 0;
        }

regular_page:
        if (pmd_trans_unstable(pmd))
                return 0;
#endif
        tlb_change_page_size(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;

                if (!pte_present(ptent))
                        continue;

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                /*
                 * Creating a THP page is expensive, so split it only if we
                 * are sure it's worth it. Split it if we are the only owner.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                break;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                break;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                break;
                        }
                        unlock_page(page);
                        put_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                /* Do not interfere with other mappings of this page */
                if (page_mapcount(page) != 1)
                        continue;

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (pte_young(ptent)) {
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        ptent = pte_mkold(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }

                /*
                 * We are deactivating a page to accelerate its reclaim.
                 * The VM can't reclaim the page unless we clear PG_young.
                 * As a side effect, this confuses idle-page tracking,
                 * which will miss the recently referenced history.
                 */
                ClearPageReferenced(page);
                test_and_clear_page_young(page);
                if (pageout) {
                        if (!isolate_lru_page(page)) {
                                if (PageUnevictable(page))
                                        putback_lru_page(page);
                                else
                                        list_add(&page->lru, &page_list);
                        }
                } else
                        deactivate_page(page);
        }

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        if (pageout)
                reclaim_pages(&page_list);
        cond_resched();

        return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = false,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = true,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
        if (vma_is_anonymous(vma))
                return true;
        if (!vma->vm_file)
                return false;
        /*
         * paging out pagecache only for non-anonymous mappings that correspond
         * to the files the calling process could (if tried) open for writing;
         * otherwise we'd be including shared non-exclusive mappings, which
         * opens a side channel.
         */
        return inode_owner_or_capable(&init_user_ns,
                                      file_inode(vma->vm_file)) ||
               file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        if (!can_do_pageout(vma))
                return 0;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte, *pte, ptent;
        struct page *page;
        int nr_swap = 0;
        unsigned long next;

        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        goto next;

        if (pmd_trans_unstable(pmd))
                return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte holds a swap entry, just clear the page table
                 * entry to prevent swap-in, which is more expensive than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (non_swap_entry(entry))
                                continue;
                        nr_swap--;
                        free_swap_and_cache(entry);
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        continue;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (!page)
                        continue;

                /*
                 * If the pmd isn't transhuge but the page is a THP and
                 * is owned by only this process, split it and
                 * deactivate all its pages.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                goto out;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                goto out;
                        }
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                pte_offset_map_lock(mm, pmd, addr, &ptl);
                                goto out;
                        }
                        unlock_page(page);
                        put_page(page);
                        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (PageSwapCache(page) || PageDirty(page)) {
                        if (!trylock_page(page))
                                continue;
                        /*
                         * If the page is shared with others, we can't clear
                         * its PG_dirty bit.
                         */
                        if (page_mapcount(page) != 1) {
                                unlock_page(page);
                                continue;
                        }

                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        /*
                         * Some architectures (e.g. PPC) don't update the TLB
                         * with set_pte_at() and tlb_remove_tlb_entry(), so,
                         * for portability, remap the pte as old and clean
                         * after clearing it.
                         */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);

                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
                mark_page_lazyfree(page);
        }
out:
        if (nr_swap) {
                if (current->mm == mm)
                        sync_mm_rss(mm);

                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
next:
        return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry              = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        range.start = max(vma->vm_start, start_addr);
        if (range.start >= vma->vm_end)
                return -EINVAL;
        range.end = min(vma->vm_end, end_addr);
        if (range.end <= vma->vm_start)
                return -EINVAL;
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                range.start, range.end);

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(&tlb, vma);
        walk_page_range(vma->vm_mm, range.start, range.end,
                        &madvise_free_walk_ops, &tlb);
        tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        zap_page_range(vma, start, end - start);
        return 0;
}
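
/*
 * Illustrative userspace sketch of the semantics described above (not
 * part of the kernel sources): on a private anonymous mapping,
 * MADV_DONTNEED discards the old contents, and the next touch faults
 * in zero-filled memory.
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 42;
 *	madvise(p, 4096, MADV_DONTNEED);
 *	assert(p[0] == 0);	// old data gone; zero-fill on next access
 */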

static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        struct mm_struct *mm = vma->vm_mm;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */

                mmap_read_lock(mm);
                vma = find_vma(mm, start);
                if (!vma)
                        return -ENOMEM;
                if (start < vma->vm_start) {
                        /*
                         * This "vma" under revalidation is the one
                         * with the lowest vma->vm_start where start
                         * is also < vma->vm_end. If start <
                         * vma->vm_start, a hole has materialized
                         * in the user address space within the
                         * virtual range passed to MADV_DONTNEED
                         * or MADV_FREE.
                         */
                        return -ENOMEM;
                }
                if (!can_madv_lru_vma(vma))
                        return -EINVAL;
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end. If the old
                         * vma was split while the mmap_lock was
                         * released, the concurrent operation must not
                         * leave madvise() with an undefined result.
                         * There may be an adjacent next vma that we'll
                         * walk next. userfaultfd_remove() will generate
                         * a repeated UFFD_EVENT_REMOVE for the
                         * end-vma->vm_end range, but the manager can
                         * handle the repetition fine.
                         */
                        end = vma->vm_end;
                }
                VM_WARN_ON(start >= end);
        }

        if (behavior == MADV_DONTNEED)
                return madvise_dontneed_single_vma(vma, start, end);
        else if (behavior == MADV_FREE)
                return madvise_free_single_vma(vma, start, end);
        else
                return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end,
                             int behavior)
{
        const bool write = behavior == MADV_POPULATE_WRITE;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long tmp_end;
        int locked = 1;
        long pages;

        *prev = vma;

        while (start < end) {
                /*
                 * We might have temporarily dropped the lock. For example,
                 * our VMA might have been split.
                 */
                if (!vma || start >= vma->vm_end) {
                        vma = find_vma(mm, start);
                        if (!vma || start < vma->vm_start)
                                return -ENOMEM;
                }

                tmp_end = min_t(unsigned long, end, vma->vm_end);
                /* Populate (prefault) page tables readable/writable. */
                pages = faultin_vma_page_range(vma, start, tmp_end, write,
                                               &locked);
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
                        *prev = NULL;
                        vma = NULL;
                }
                if (pages < 0) {
                        switch (pages) {
                        case -EINTR:
                                return -EINTR;
                        case -EINVAL: /* Incompatible mappings / permissions. */
                                return -EINVAL;
                        case -EHWPOISON:
                                return -EHWPOISON;
                        case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
                                return -EFAULT;
                        default:
                                pr_warn_once("%s: unhandled return value: %ld\n",
                                             __func__, pages);
                                fallthrough;
                        case -ENOMEM:
                                return -ENOMEM;
                        }
                }
                start += pages * PAGE_SIZE;
        }
        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;
        struct mm_struct *mm = vma->vm_mm;

        *prev = NULL;   /* tell sys_madvise we drop mmap_lock */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                return -EINVAL;
        }

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_rwsem.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
                mmap_read_unlock(mm);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        mmap_read_lock(mm);
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        unsigned long size;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        for (; start < end; start += size) {
                unsigned long pfn;
                struct page *page;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * When soft offlining hugepages, after migrating the page
                 * we dissolve it, therefore in the second loop "page" will
                 * no longer be a compound page.
                 */
                size = page_size(compound_head(page));

                if (behavior == MADV_SOFT_OFFLINE) {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
                } else {
                        pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = memory_failure(pfn, MF_COUNT_INCREASED);
                }

                if (ret)
                        return ret;
        }

        return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
                return madvise_populate(vma, prev, start, end, behavior);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static bool
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_FREE:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON:
#endif
                return true;

        default:
                return false;
        }
}

static bool
process_madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_WILLNEED:
                return true;
        default:
                return false;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the
 *              application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *              where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *              were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *              huge pages in the future. Existing pages might be coalesced and
 *              new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *              transparent huge pages so the existing pages will not be
 *              coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *              from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *              deactivate pages in this range so that they can be reclaimed
 *              easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *              page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *              triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *              triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or the application
 *              is attempting to release locked or shared pages,
 *              or the specified address range includes file, Huge TLB,
 *              MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
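
/*
 * Minimal userspace sketch of the interface documented above
 * (illustrative only, not part of the kernel sources; fd, len and all
 * error handling are assumptions for the example):
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_SEQUENTIAL);	// hint: aggressive readahead
 *	// ... stream through p[0..len) once ...
 *	madvise(p, len, MADV_COLD);		// hint: reclaim these first
 */
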
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;
        struct blk_plug plug;

        start = untagged_addr(start);

        if (!madvise_behavior_valid(behavior))
                return error;

        if (!PAGE_ALIGNED(start))
                return error;
        len = PAGE_ALIGN(len_in);

        /* Check whether len was rounded up from a small negative value to zero */
        if (len_in && !len)
                return error;

        end = start + len;
        if (end < start)
                return error;

        error = 0;
        if (end == start)
                return error;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_inject_error(behavior, start, start + len_in);
#endif

        write = madvise_need_mmap_write(behavior);
        if (write) {
                if (mmap_write_lock_killable(mm))
                        return -EINTR;
        } else {
                mmap_read_lock(mm);
        }

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * This differs from the handling in mlock etc.
         */
        vma = find_vma_prev(mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        blk_start_plug(&plug);
        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }
out:
        blk_finish_plug(&plug);
        if (write)
                mmap_write_unlock(mm);
        else
                mmap_read_unlock(mm);

        return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        return do_madvise(current->mm, start, len_in, behavior);
}

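/*
 * Illustrative userspace sketch for process_madvise(2) (not part of the
 * kernel sources; target_pid, addr and len are assumptions for the
 * example). The caller needs CAP_SYS_NICE plus PTRACE_MODE_READ on the
 * target; raw syscall(2) is shown since libc wrappers may be missing:
 *
 *	int pidfd = syscall(__NR_pidfd_open, target_pid, 0);
 *	struct iovec iov = {
 *		.iov_base = (void *)addr,	// range in the target's address space
 *		.iov_len  = len,
 *	};
 *	ssize_t n = syscall(__NR_process_madvise, pidfd, &iov, 1,
 *			    MADV_COLD, 0);	// returns bytes advised, or -1
 */
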
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
{
        ssize_t ret;
        struct iovec iovstack[UIO_FASTIOV], iovec;
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        struct mm_struct *mm;
        size_t total_len;
        unsigned int f_flags;

        if (flags != 0) {
                ret = -EINVAL;
                goto out;
        }

        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                goto out;

        task = pidfd_get_task(pidfd, &f_flags);
        if (IS_ERR(task)) {
                ret = PTR_ERR(task);
                goto free_iov;
        }

        if (!process_madvise_behavior_valid(behavior)) {
                ret = -EINVAL;
                goto release_task;
        }

        /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR_OR_NULL(mm)) {
                ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
                goto release_task;
        }

        /*
         * Require CAP_SYS_NICE for influencing process performance. Note that
         * only non-destructive hints are currently supported.
         */
        if (!capable(CAP_SYS_NICE)) {
                ret = -EPERM;
                goto release_mm;
        }

        total_len = iov_iter_count(&iter);

        while (iov_iter_count(&iter)) {
                iovec = iov_iter_iovec(&iter);
                ret = do_madvise(mm, (unsigned long)iovec.iov_base,
                                        iovec.iov_len, behavior);
                if (ret < 0)
                        break;
                iov_iter_advance(&iter, iovec.iov_len);
        }

        if (ret == 0)
                ret = total_len - iov_iter_count(&iter);

release_mm:
        mmput(mm);
release_task:
        put_task_struct(task);
free_iov:
        kfree(iov);
out:
        return ret;
}