// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to be
 * taken while holding the mmap_lock for writing.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
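
/*
 * can_madv_lru_vma() is used by the MADV_COLD/MADV_PAGEOUT/MADV_DONTNEED
 * paths below but is not defined elsewhere in this file; the definition
 * here is assumed to match the mainline mm/madvise.c helper: LRU-based
 * hints do not apply to locked, hugetlb or PFN-mapped VMAs.
 */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}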

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page for accelerating reclaiming.
		 * VM couldn't reclaim the page unless we clear PG_young.
		 * As a side effect, it makes the page young to minimize
		 * the possibility of reclaiming the page.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}
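
/*
 * Note (explanatory, not from the original source): MADV_COLD and
 * MADV_PAGEOUT share cold_walk_ops; the pageout flag in
 * madvise_walk_private selects between them.  A cold hint only
 * deactivates the pages (deactivate_page()), making them early
 * candidates for reclaim, while a pageout hint isolates them from the
 * LRU and reclaims them immediately via reclaim_pages().
 */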

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If page is shared with others, we couldn't clear
			 * PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some of architecture(ex, PPC) don't update TLB
			 * with set_pte_at and tlb_remove_tlb_entry so for
			 * the portability, remap the pte with old|clean
			 * after pte clearing.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}
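
/*
 * Note (explanatory, not from the original source): madvise_free_single_vma()
 * does not free pages right away.  Clean anonymous pages are marked
 * lazyfree (mark_page_lazyfree() above) and are discarded only if memory
 * pressure arises before the application touches them again; a later
 * write re-dirties the page and effectively cancels the hint for it.
 */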

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means an hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * error if there is an hole between the
			 * current end and that next vma's start.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop's page fault
		 * fail, we can simply treat it as hugepage instead of pages.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
		return true;
	default:
		return false;
	}
}
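
/*
 * Note (explanatory, not from the original source): only the
 * non-destructive hints MADV_COLD and MADV_PAGEOUT may be applied to
 * another process via process_madvise(); destructive behaviors such as
 * MADV_DONTNEED or MADV_FREE remain restricted to the caller's own
 * address space through madvise().
 */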

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * discard pages of memory that are no longer in use by the
 * application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (!PAGE_ALIGNED(start))
		return error;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
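
/*
 * A minimal userspace sketch of the syscall above (illustrative only,
 * not part of this file): reserve a page-aligned region with mmap(),
 * then tell the kernel its contents are no longer needed.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t len = 16 * sysconf(_SC_PAGESIZE);
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	...use buf...
 *	if (madvise(buf, len, MADV_DONTNEED))	// discard; the next touch
 *		perror("madvise");		// yields zero-fill pages
 *
 * The address must be page-aligned (mmap() guarantees this); an
 * unaligned address makes do_madvise() return -EINVAL.
 */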

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	kfree(iov);
out:
	return ret;
}
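
/*
 * A minimal userspace sketch of process_madvise() (illustrative only,
 * not part of this file; glibc may not provide a wrapper at this kernel
 * version, so syscall(2) is used).  The caller needs a pidfd for the
 * target plus PTRACE_MODE_READ access and CAP_SYS_NICE, per the checks
 * above.
 *
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = {
 *		.iov_base = (void *)target_addr,	// in target's AS
 *		.iov_len  = target_len,
 *	};
 *	ssize_t n = syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			    MADV_PAGEOUT, 0);	// n = bytes advised, or -1
 *
 * target_pid, target_addr and target_len are placeholders for values
 * obtained elsewhere (e.g. from /proc/<pid>/maps).
 */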