// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which only examine memory, do not.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

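/*
 * Swap-in helpers for MADV_WILLNEED: with CONFIG_SWAP, anonymous memory is
 * brought back from swap asynchronously via a page-table walk, and shmem
 * mappings via the swap entries stored in their page cache.
 */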
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

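/*
 * Walk the shmem mapping's page cache and kick off asynchronous reads for
 * every swapped-out entry in [start, end], then drain the LRU pagevecs.
 */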
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain();	/* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

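/*
 * Page-table walker shared by MADV_COLD and MADV_PAGEOUT: age the PMD/PTEs
 * in the range and either deactivate the backing pages (cold) or isolate
 * and reclaim them immediately (pageout).
 */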
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * Clear the referenced/young state so that reclaim will
		 * treat the page as cold; otherwise the VM would skip it
		 * when scanning the LRU lists.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

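/*
 * MADV_COLD: deactivate the pages in the range so they are reclaimed ahead
 * of other pages when memory pressure occurs. No data is discarded.
 */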
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * Pageout of page cache is only allowed for mappings whose file the
	 * caller could open for writing; otherwise shared, non-exclusive
	 * mappings would let an unprivileged caller evict someone else's
	 * page cache, opening a side channel.
	 */
	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

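/*
 * MADV_FREE page-table walker: drop swap entries, clear the dirty and young
 * bits, and mark clean anonymous pages lazily freeable so reclaim can discard
 * them without writing them out to swap.
 */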
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry and drop the swap slot: swapping the data back in
		 * would be more expensive than allocating a zeroed page.
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Split a THP that is mapped only by this process so that
		 * the individual subpages can be marked lazyfree below.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we mustn't
			 * clear its dirty flag.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures do not flush the TLB on
			 * set_pte_at()/tlb_remove_tlb_entry() alone, so for
			 * portability clear the pte first and re-install it
			 * as old and clean.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range() call sets things up for reclaim to actually free
 * these pages later if no one else has touched them in the meantime.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing the data to backing store.
 * There is no reason to write this data out to the swap area if the
 * application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one with the
			 * lowest vma->vm_start where start is also below
			 * vma->vm_end.  If start < vma->vm_start, a hole
			 * materialized in the user address space within the
			 * range passed to MADV_DONTNEED or MADV_FREE while
			 * the mmap_lock was dropped.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end.  If the old vma
			 * was split while the mmap_lock was released, the
			 * concurrent operation does not make the madvise()
			 * result undefined: there may be an adjacent next
			 * vma that we'll walk next.  userfaultfd_remove()
			 * will generate an UFFD_EVENT_REMOVE repetition on
			 * the end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif		/* CONFIG_MEMORY_FAILURE */

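/*
 * Apply one madvise behavior to a single vma: the behaviors that act on page
 * contents are dispatched to their dedicated helpers, everything else updates
 * vma->vm_flags via madvise_behavior().
 */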
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
		return true;
	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger the memory error handler as if the given memory
 *		range were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or the application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP ranges.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (!PAGE_ALIGNED(start))
		return error;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_vma() dropped the mmap_lock, prev is stale */
			vma = find_vma(mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

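/*
 * The process_madvise(2) system call: apply an madvise hint to the address
 * space of another process, identified by a pidfd.  The caller must be able
 * to ptrace-attach to the target, and only the non-destructive hints accepted
 * by process_madvise_behavior_valid() (MADV_COLD, MADV_PAGEOUT) are allowed.
 */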
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

	mmput(mm);
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	kfree(iov);
out:
	return ret;
}