// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

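/*
 * Per-walk state shared by the MADV_COLD/MADV_PAGEOUT page table walks:
 * "pageout" selects immediate reclaim rather than mere deactivation.
 */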
struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

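/*
 * Shmem mappings don't keep swap entries in the process page tables;
 * walk the mapping's xarray instead and start async swap-in for every
 * swap entry found in the range.
 */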
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain();	/* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

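/*
 * Deactivate or reclaim (when walk_private->pageout is set) the pages in
 * [addr, end): clear the young/referenced bits so reclaim treats them as
 * cold, and batch pageout candidates on a local list for reclaim_pages().
 * THPs are handled in place when fully covered, otherwise split first.
 */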
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page for accelerating reclaiming.
		 * VM couldn't reclaim the page unless we clear PG_young.
		 * As a side effect, it makes confuse idle-page tracking
		 * because they will miss recent referenced history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

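/*
 * MADV_COLD: the application is not expected to use this memory soon;
 * deactivate the pages so they can be reclaimed more easily under
 * memory pressure.
 */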
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

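/*
 * MADV_PAGEOUT: reclaim the pages in the range right away, subject to
 * the can_do_pageout() permission check above.
 */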
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

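/*
 * MADV_FREE pte walk: drop swap entries outright, clean and age mapped
 * pages, and mark them lazyfree so reclaim may discard them without
 * writeback unless they are dirtied again first.
 */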
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If page is shared with others, we couldn't clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some of architecture(ex, PPC) don't update TLB
			 * with set_pte_at and tlb_remove_tlb_entry so for
			 * the portability, remap the pte with old|clean
			 * after pte clearing.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

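/*
 * Flush the LRU caches, then run the MADV_FREE walk over the intersection
 * of the vma and [start_addr, end_addr) under an mmu_notifier CLEAR range.
 */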
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

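/*
 * userfaultfd_remove() may drop mmap_lock to notify the userfault
 * monitor; in that case the vma must be looked up again and revalidated
 * before zapping.
 */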
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

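/*
 * MADV_POPULATE_READ/WRITE: prefault page tables by triggering read or
 * write faults across the range, retrying whenever the lock was dropped.
 */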
static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = find_vma(mm, start);
			if (!vma || start < vma->vm_start)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL:
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT:
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif	/* CONFIG_MEMORY_FAILURE */

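/*
 * Dispatch one vma's worth of advice to the matching handler; anything
 * not listed here only updates vma flags via madvise_behavior().
 */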
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

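/*
 * Behaviors guarded by #ifdef below are only valid when the kernel was
 * built with the corresponding support (KSM, THP, memory failure).
 */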
static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		return true;
	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * discard pages of memory that are no longer needed by the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clustered.  This
 *		enables limited readahead and reads around the fault.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required.
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		PROT_NONE, or invalid memory ranges.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (!PAGE_ALIGNED(start))
		return error;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

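/*
 * process_madvise(2): apply advice to another process's address space,
 * identified by pidfd, over an iovec of ranges. Only the non-destructive
 * behaviors accepted by process_madvise_behavior_valid() are allowed, and
 * the caller needs both ptrace read access and CAP_SYS_NICE.
 */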
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}