// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"

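/*
 * Behaviours that update vma->vm_flags, or that split or merge VMAs, must
 * hold mmap_sem for writing. The behaviours listed below only operate on
 * page tables or page-cache state, so mmap_sem held for reading is enough.
 */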
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

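/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */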
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such
			 * as slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such
			 * as slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such
			 * as slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such
			 * as slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;
out:
	return error;
}

#ifdef CONFIG_SWAP
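/*
 * Walk the ptes under one pmd and start asynchronous swap-in (readahead)
 * for any swapped-out anonymous pages in the range.
 */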
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
							vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mm_walk walk = {
		.mm = vma->vm_mm,
		.pmd_entry = swapin_walk_pmd_entry,
		.private = vma,
	};

	walk_page_range(start, end, &walk);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

		page = find_get_entry(mapping, index);
		if (!xa_is_value(page)) {
			if (page)
				put_page(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */
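/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */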
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		force_swapin_readahead(vma, start, end);
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
						file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}
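/*
 * MADV_FREE page-table walker: for each anonymous pte in the range, drop
 * swap entries, clear the dirty and young bits, and mark the backing page
 * lazyfree so reclaim may discard it instead of swapping it out.
 */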
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has a swp_entry, just clear the page table to
		 * prevent swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we cannot clear
			 * its PG_dirty flag.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability re-install the old (clean) pte
			 * after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static void madvise_free_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct mm_walk free_walk = {
		.pmd_entry = madvise_free_pte_range,
		.mm = vma->vm_mm,
		.private = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(addr, end, &free_walk);
	tlb_end_vma(tlb, vma);
}
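/*
 * Apply MADV_FREE to the part of a single anonymous VMA that overlaps
 * [start_addr, end_addr), with TLB batching and mmu-notifier invalidation
 * bracketing the page-table walk.
 */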
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	madvise_free_page_range(&tlb, vma, range.start, range.end);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}
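/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range() call sets things up for reclaim to actually free
 * these pages later if no one else has touched them in the meantime.
 *
 * NB: this interface discards data rather than pushing it out to swap,
 * as some implementations do.
 */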
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

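/*
 * Common entry point for MADV_DONTNEED and MADV_FREE.  Note that
 * userfaultfd_remove() may drop mmap_sem, in which case the VMA must be
 * looked up again and revalidated before the range is zapped.
 */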
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	*prev = vma;
	if (!can_madv_dontneed_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_sem has been dropped, prev is stale */

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_dontneed_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while mmap_sem was released,
			 * the concurrent operation does not give
			 * madvise() an undefined result; there may be
			 * an adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}
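/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */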
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_sem was not released by userfaultfd_remove() */
		up_read(&current->mm->mmap_sem);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
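/*
 * Error injection support for memory error handling.
 */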
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	struct page *page;
	struct zone *zone;
	unsigned int order;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += PAGE_SIZE << order) {
		unsigned long pfn;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		order = compound_order(compound_head(page));

		if (PageHWPoison(page)) {
			put_page(page);
			continue;
		}

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
					pfn, start);

			ret = soft_offline_page(page, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}

		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);

		/*
		 * Drop the page reference taken by get_user_pages_fast().
		 * The memory error handling code takes its own reference
		 * on the page as needed.
		 */
		put_page(page);
		ret = memory_failure(pfn, 0);
		if (ret)
			return ret;
	}

	/* Ensure that all poisoned pages are removed from per-cpu lists */
	for_each_populated_zone(zone)
		drain_all_pages(zone);

	return 0;
}
#endif	/* CONFIG_MEMORY_FAILURE */
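/*
 * Dispatch a single madvise behaviour to its handler for one VMA.
 */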
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * discard, free, or read ahead pages as appropriate.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger the memory error handler as if the given memory
 *		range were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by
 *		transparent huge pages in the future. Existing pages might be
 *		coalesced and new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so khugepaged skips the range.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		PROT_NONE, or VM_IO mappings.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
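/*
 * Illustrative userspace sketch (not part of the kernel sources): an
 * application that is done with a large anonymous scratch buffer can
 * return its pages to the kernel while keeping the mapping itself:
 *
 *	char *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	...fill and use buf...
 *	madvise(buf, 1 << 20, MADV_DONTNEED);
 *	...buf reads back as zero-filled pages on the next touch...
 */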
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (down_write_killable(&current->mm->mmap_sem))
			return -EINTR;
	} else {
		down_read(&current->mm->mmap_sem);
	}

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end). */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}