// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

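/*
 * Walk state shared by madvise_cold() and madvise_pageout(): the mmu_gather
 * batch being filled, and whether pages should be reclaimed immediately
 * (pageout) or merely deactivated.
 */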
struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to be
 * exclusive.  The others are fine to change while the mmap_sem is held.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

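	/*
	 * Read in each swapped-out page in the range asynchronously; the
	 * reference returned by read_swap_cache_async() is dropped right
	 * away since we only want the pages resident in the swap cache.
	 */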
	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

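	/*
	 * Plain pages in the mapping are already resident; value entries
	 * denote swapped-out shmem pages, which we read back asynchronously.
	 */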
	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!xa_is_value(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain();	/* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
	get_file(file);
	up_read(&current->mm->mmap_sem);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	down_read(&current->mm->mmap_sem);
	return 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

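	/* First see if the advice can be applied to a whole huge PMD. */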
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);
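		/*
		 * The advice covers only part of the huge page: split it
		 * if this mapping is its only user, otherwise leave it be.
		 */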
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			if (page_mapcount(page) != 1)
				goto huge_unlock;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
regular_page:
#endif
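	/* Base-page path: age each mapped page, then deactivate or reclaim. */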
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth it. Split it if we are the only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

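		/* Clear the accessed bit so reclaim will see the page as cold. */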
		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page for accelerating reclaiming.
		 * VM couldn't reclaim the page unless we clear PG_young.
		 * As a side effect, it makes the page old so that vmscan
		 * has a chance to reclaim the page.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;

	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

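/*
 * Page table walk callback for MADV_FREE: release swap entries, clear the
 * dirty and young bits of mapped anonymous pages, and mark them lazyfree
 * so reclaim can discard them without writing them out.
 */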
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

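		/*
		 * A lazyfree page must be clean and unbacked by swap: try to
		 * drop its swap cache slot and dirty bit before marking it.
		 */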
		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we couldn't
			 * clear PG_dirty of the page.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability, remap the pte as old and clean
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works only for anonymous VMAs at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

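	/* Flush this CPU's pending LRU additions so the walk sees those pages. */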
	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_sem was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;
	unsigned int order;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

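	/*
	 * "order" is assigned from each page found below, so each iteration
	 * advances by a whole compound page when one is encountered.
	 */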
	for (; start < end; start += PAGE_SIZE << order) {
		unsigned long pfn;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		order = compound_order(compound_head(page));

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
					pfn, start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);

		/*
		 * Drop the page reference taken by get_user_pages_fast(). In
		 * the absence of MF_COUNT_INCREASED the memory_failure()
		 * routine is responsible for pinning the page to prevent it
		 * from being released back to the page allocator.
		 */
		put_page(page);
		ret = memory_failure(pfn, 0);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * discard pages of memory that are no longer needed by the
 * application.  MADV_DONTNEED means that the memory in question will
 * probably be discarded, MADV_FREE means it can be freed lazily, etc.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		PROT_NONE, or invalid memory ranges.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an attempt was made to read data from the address range.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

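	/* User pointers may carry architecture tag bits; strip them first. */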
	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

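	/* Plug block I/O so any readahead issued below can be batched. */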
	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}