1
2
3
4
5
6
7
8
9#include <linux/mman.h>
10#include <linux/pagemap.h>
11#include <linux/syscalls.h>
12#include <linux/mempolicy.h>
13#include <linux/page-isolation.h>
14#include <linux/userfaultfd_k.h>
15#include <linux/hugetlb.h>
16#include <linux/falloc.h>
17#include <linux/sched.h>
18#include <linux/ksm.h>
19#include <linux/fs.h>
20#include <linux/file.h>
21#include <linux/blkdev.h>
22#include <linux/backing-dev.h>
23#include <linux/swap.h>
24#include <linux/swapops.h>
25#include <linux/shmem_fs.h>
26#include <linux/mmu_notifier.h>
27
28#include <asm/tlb.h>
29
30#include "internal.h"
31
32
33
34
35
36
/*
 * Tell the caller which flavour of mmap_sem the given advice needs.
 *
 * Returns 0 for the advice values that are listed as exceptions
 * (MADV_REMOVE, MADV_WILLNEED, MADV_DONTNEED, MADV_FREE) and 1 for every
 * other value, so anything not explicitly exempted asks for the write lock.
 */
static int madvise_need_mmap_write(int behavior)
{
	if (behavior == MADV_REMOVE || behavior == MADV_WILLNEED)
		return 0;
	if (behavior == MADV_DONTNEED || behavior == MADV_FREE)
		return 0;

	/* Be safe: unknown or unlisted advice defaults to the write lock. */
	return 1;
}
50
51
52
53
54
55static long madvise_behavior(struct vm_area_struct *vma,
56 struct vm_area_struct **prev,
57 unsigned long start, unsigned long end, int behavior)
58{
59 struct mm_struct *mm = vma->vm_mm;
60 int error = 0;
61 pgoff_t pgoff;
62 unsigned long new_flags = vma->vm_flags;
63
64 switch (behavior) {
65 case MADV_NORMAL:
66 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
67 break;
68 case MADV_SEQUENTIAL:
69 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
70 break;
71 case MADV_RANDOM:
72 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
73 break;
74 case MADV_DONTFORK:
75 new_flags |= VM_DONTCOPY;
76 break;
77 case MADV_DOFORK:
78 if (vma->vm_flags & VM_IO) {
79 error = -EINVAL;
80 goto out;
81 }
82 new_flags &= ~VM_DONTCOPY;
83 break;
84 case MADV_WIPEONFORK:
85
86 if (vma->vm_file || vma->vm_flags & VM_SHARED) {
87 error = -EINVAL;
88 goto out;
89 }
90 new_flags |= VM_WIPEONFORK;
91 break;
92 case MADV_KEEPONFORK:
93 new_flags &= ~VM_WIPEONFORK;
94 break;
95 case MADV_DONTDUMP:
96 new_flags |= VM_DONTDUMP;
97 break;
98 case MADV_DODUMP:
99 if (new_flags & VM_SPECIAL) {
100 error = -EINVAL;
101 goto out;
102 }
103 new_flags &= ~VM_DONTDUMP;
104 break;
105 case MADV_MERGEABLE:
106 case MADV_UNMERGEABLE:
107 error = ksm_madvise(vma, start, end, behavior, &new_flags);
108 if (error) {
109
110
111
112
113 if (error == -ENOMEM)
114 error = -EAGAIN;
115 goto out;
116 }
117 break;
118 case MADV_HUGEPAGE:
119 case MADV_NOHUGEPAGE:
120 error = hugepage_madvise(vma, &new_flags, behavior);
121 if (error) {
122
123
124
125
126 if (error == -ENOMEM)
127 error = -EAGAIN;
128 goto out;
129 }
130 break;
131 }
132
133 if (new_flags == vma->vm_flags) {
134 *prev = vma;
135 goto out;
136 }
137
138 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
139 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
140 vma->vm_file, pgoff, vma_policy(vma),
141 vma->vm_userfaultfd_ctx);
142 if (*prev) {
143 vma = *prev;
144 goto success;
145 }
146
147 *prev = vma;
148
149 if (start != vma->vm_start) {
150 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
151 error = -ENOMEM;
152 goto out;
153 }
154 error = __split_vma(mm, vma, start, 1);
155 if (error) {
156
157
158
159
160 if (error == -ENOMEM)
161 error = -EAGAIN;
162 goto out;
163 }
164 }
165
166 if (end != vma->vm_end) {
167 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
168 error = -ENOMEM;
169 goto out;
170 }
171 error = __split_vma(mm, vma, end, 0);
172 if (error) {
173
174
175
176
177 if (error == -ENOMEM)
178 error = -EAGAIN;
179 goto out;
180 }
181 }
182
183success:
184
185
186
187 vma->vm_flags = new_flags;
188out:
189 return error;
190}
191
192#ifdef CONFIG_SWAP
193static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
194 unsigned long end, struct mm_walk *walk)
195{
196 pte_t *orig_pte;
197 struct vm_area_struct *vma = walk->private;
198 unsigned long index;
199
200 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
201 return 0;
202
203 for (index = start; index != end; index += PAGE_SIZE) {
204 pte_t pte;
205 swp_entry_t entry;
206 struct page *page;
207 spinlock_t *ptl;
208
209 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
210 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
211 pte_unmap_unlock(orig_pte, ptl);
212
213 if (pte_present(pte) || pte_none(pte))
214 continue;
215 entry = pte_to_swp_entry(pte);
216 if (unlikely(non_swap_entry(entry)))
217 continue;
218
219 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
220 vma, index, false);
221 if (page)
222 put_page(page);
223 }
224
225 return 0;
226}
227
228static void force_swapin_readahead(struct vm_area_struct *vma,
229 unsigned long start, unsigned long end)
230{
231 struct mm_walk walk = {
232 .mm = vma->vm_mm,
233 .pmd_entry = swapin_walk_pmd_entry,
234 .private = vma,
235 };
236
237 walk_page_range(start, end, &walk);
238
239 lru_add_drain();
240}
241
242static void force_shm_swapin_readahead(struct vm_area_struct *vma,
243 unsigned long start, unsigned long end,
244 struct address_space *mapping)
245{
246 pgoff_t index;
247 struct page *page;
248 swp_entry_t swap;
249
250 for (; start < end; start += PAGE_SIZE) {
251 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
252
253 page = find_get_entry(mapping, index);
254 if (!radix_tree_exceptional_entry(page)) {
255 if (page)
256 put_page(page);
257 continue;
258 }
259 swap = radix_to_swp_entry(page);
260 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
261 NULL, 0, false);
262 if (page)
263 put_page(page);
264 }
265
266 lru_add_drain();
267}
268#endif
269
270
271
272
273static long madvise_willneed(struct vm_area_struct *vma,
274 struct vm_area_struct **prev,
275 unsigned long start, unsigned long end)
276{
277 struct file *file = vma->vm_file;
278
279 *prev = vma;
280#ifdef CONFIG_SWAP
281 if (!file) {
282 force_swapin_readahead(vma, start, end);
283 return 0;
284 }
285
286 if (shmem_mapping(file->f_mapping)) {
287 force_shm_swapin_readahead(vma, start, end,
288 file->f_mapping);
289 return 0;
290 }
291#else
292 if (!file)
293 return -EBADF;
294#endif
295
296 if (IS_DAX(file_inode(file))) {
297
298 return 0;
299 }
300
301 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
302 if (end > vma->vm_end)
303 end = vma->vm_end;
304 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
305
306 force_page_cache_readahead(file->f_mapping, file, start, end - start);
307 return 0;
308}
309
/*
 * pmd_entry callback for the MADV_FREE page walk.
 *
 * walk->private is the mmu_gather of the operation.  For each pte in
 * [addr, end):
 *  - a swap entry is freed and its pte cleared;
 *  - a mapped page has its pte made old and clean and is handed to
 *    mark_page_lazyfree(), unless one of the checks below skips it.
 *
 * Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	/* A huge pmd fully handled by the helper needs no pte walk. */
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	/* orig_pte is kept for the final unmap; pte is the loop cursor. */
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * Not present: if this is a real swap entry, release the swap
		 * slot and clear the pte.  nr_swap goes negative and is
		 * applied to MM_SWAPENTS at the end.
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = _vm_normal_page(vma, addr, ptent, true);
		if (!page)
			continue;

		/*
		 * The pmd is not huge but the page is part of a compound
		 * page.  Split it only when it is mapped exactly once and its
		 * page lock can be taken without waiting; otherwise stop
		 * processing this pte range.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			/* The split runs without the page-table lock held. */
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				/* Re-take the lock so the unlock at out: is balanced. */
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			/*
			 * Step back one slot: the loop increment then lands on
			 * the same address again, now mapping a small page.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * Leave the page alone when it is mapped more than
			 * once; its dirty state is not cleared in that case.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			/* Skip the page if its swap cache entry cannot be freed. */
			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Clear the pte, then install it again as old and
			 * clean, and queue the address for a TLB flush.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		/* nr_swap is <= 0: subtract the freed swap entries. */
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}
442
443static void madvise_free_page_range(struct mmu_gather *tlb,
444 struct vm_area_struct *vma,
445 unsigned long addr, unsigned long end)
446{
447 struct mm_walk free_walk = {
448 .pmd_entry = madvise_free_pte_range,
449 .mm = vma->vm_mm,
450 .private = tlb,
451 };
452
453 tlb_start_vma(tlb, vma);
454 walk_page_range(addr, end, &free_walk);
455 tlb_end_vma(tlb, vma);
456}
457
458static int madvise_free_single_vma(struct vm_area_struct *vma,
459 unsigned long start_addr, unsigned long end_addr)
460{
461 unsigned long start, end;
462 struct mm_struct *mm = vma->vm_mm;
463 struct mmu_gather tlb;
464
465
466 if (!vma_is_anonymous(vma))
467 return -EINVAL;
468
469 start = max(vma->vm_start, start_addr);
470 if (start >= vma->vm_end)
471 return -EINVAL;
472 end = min(vma->vm_end, end_addr);
473 if (end <= vma->vm_start)
474 return -EINVAL;
475
476 lru_add_drain();
477 tlb_gather_mmu(&tlb, mm, start, end);
478 update_hiwater_rss(mm);
479
480 mmu_notifier_invalidate_range_start(mm, start, end);
481 madvise_free_page_range(&tlb, vma, start, end);
482 mmu_notifier_invalidate_range_end(mm, start, end);
483 tlb_finish_mmu(&tlb, start, end);
484
485 return 0;
486}
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
/*
 * MADV_DONTNEED on [start, end) of @vma: hand the range to
 * zap_page_range() as (start, length).  Always returns 0.
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	const unsigned long size = end - start;

	zap_page_range(vma, start, size);
	return 0;
}
513
514static long madvise_dontneed_free(struct vm_area_struct *vma,
515 struct vm_area_struct **prev,
516 unsigned long start, unsigned long end,
517 int behavior)
518{
519 *prev = vma;
520 if (!can_madv_dontneed_vma(vma))
521 return -EINVAL;
522
523 if (!userfaultfd_remove(vma, start, end)) {
524 *prev = NULL;
525
526 down_read(¤t->mm->mmap_sem);
527 vma = find_vma(current->mm, start);
528 if (!vma)
529 return -ENOMEM;
530 if (start < vma->vm_start) {
531
532
533
534
535
536
537
538
539
540 return -ENOMEM;
541 }
542 if (!can_madv_dontneed_vma(vma))
543 return -EINVAL;
544 if (end > vma->vm_end) {
545
546
547
548
549
550
551
552
553
554
555
556
557 end = vma->vm_end;
558 }
559 VM_WARN_ON(start >= end);
560 }
561
562 if (behavior == MADV_DONTNEED)
563 return madvise_dontneed_single_vma(vma, start, end);
564 else if (behavior == MADV_FREE)
565 return madvise_free_single_vma(vma, start, end);
566 else
567 return -EINVAL;
568}
569
570
571
572
573
574static long madvise_remove(struct vm_area_struct *vma,
575 struct vm_area_struct **prev,
576 unsigned long start, unsigned long end)
577{
578 loff_t offset;
579 int error;
580 struct file *f;
581
582 *prev = NULL;
583
584 if (vma->vm_flags & VM_LOCKED)
585 return -EINVAL;
586
587 f = vma->vm_file;
588
589 if (!f || !f->f_mapping || !f->f_mapping->host) {
590 return -EINVAL;
591 }
592
593 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
594 return -EACCES;
595
596 offset = (loff_t)(start - vma->vm_start)
597 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
598
599
600
601
602
603
604
605 get_file(f);
606 if (userfaultfd_remove(vma, start, end)) {
607
608 up_read(¤t->mm->mmap_sem);
609 }
610 error = vfs_fallocate(f,
611 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
612 offset, end - start);
613 fput(f);
614 down_read(¤t->mm->mmap_sem);
615 return error;
616}
617
618#ifdef CONFIG_MEMORY_FAILURE
619
620
621
622static int madvise_inject_error(int behavior,
623 unsigned long start, unsigned long end)
624{
625 struct page *page;
626 struct zone *zone;
627 unsigned int order;
628
629 if (!capable(CAP_SYS_ADMIN))
630 return -EPERM;
631
632
633 for (; start < end; start += PAGE_SIZE << order) {
634 int ret;
635
636 ret = get_user_pages_fast(start, 1, 0, &page);
637 if (ret != 1)
638 return ret;
639
640
641
642
643
644
645 order = compound_order(compound_head(page));
646
647 if (PageHWPoison(page)) {
648 put_page(page);
649 continue;
650 }
651
652 if (behavior == MADV_SOFT_OFFLINE) {
653 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
654 page_to_pfn(page), start);
655
656 ret = soft_offline_page(page, MF_COUNT_INCREASED);
657 if (ret)
658 return ret;
659 continue;
660 }
661 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
662 page_to_pfn(page), start);
663
664 ret = memory_failure(page_to_pfn(page), MF_COUNT_INCREASED);
665 if (ret)
666 return ret;
667 }
668
669
670 for_each_populated_zone(zone)
671 drain_all_pages(zone);
672
673 return 0;
674}
675#endif
676
677static long
678madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
679 unsigned long start, unsigned long end, int behavior)
680{
681 switch (behavior) {
682 case MADV_REMOVE:
683 return madvise_remove(vma, prev, start, end);
684 case MADV_WILLNEED:
685 return madvise_willneed(vma, prev, start, end);
686 case MADV_FREE:
687 case MADV_DONTNEED:
688 return madvise_dontneed_free(vma, prev, start, end, behavior);
689 default:
690 return madvise_behavior(vma, prev, start, end, behavior);
691 }
692}
693
/*
 * Return true if @behavior is an advice value this build accepts.
 *
 * The KSM, transparent-hugepage and memory-failure values are only
 * accepted when the matching CONFIG_ option is enabled.
 */
static bool
madvise_behavior_valid(int behavior)
{
	bool accepted = false;

	switch (behavior) {
	/* Access-pattern hints and paging advice. */
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_REMOVE:
	/* Fork and core-dump behaviour. */
	case MADV_DONTFORK:
	case MADV_DOFORK:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
	case MADV_DONTDUMP:
	case MADV_DODUMP:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		accepted = true;
		break;

	default:
		break;
	}

	return accepted;
}
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
792{
793 unsigned long end, tmp;
794 struct vm_area_struct *vma, *prev;
795 int unmapped_error = 0;
796 int error = -EINVAL;
797 int write;
798 size_t len;
799 struct blk_plug plug;
800
801 if (!madvise_behavior_valid(behavior))
802 return error;
803
804 if (start & ~PAGE_MASK)
805 return error;
806 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
807
808
809 if (len_in && !len)
810 return error;
811
812 end = start + len;
813 if (end < start)
814 return error;
815
816 error = 0;
817 if (end == start)
818 return error;
819
820#ifdef CONFIG_MEMORY_FAILURE
821 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
822 return madvise_inject_error(behavior, start, start + len_in);
823#endif
824
825 write = madvise_need_mmap_write(behavior);
826 if (write) {
827 if (down_write_killable(¤t->mm->mmap_sem))
828 return -EINTR;
829 } else {
830 down_read(¤t->mm->mmap_sem);
831 }
832
833
834
835
836
837
838 vma = find_vma_prev(current->mm, start, &prev);
839 if (vma && start > vma->vm_start)
840 prev = vma;
841
842 blk_start_plug(&plug);
843 for (;;) {
844
845 error = -ENOMEM;
846 if (!vma)
847 goto out;
848
849
850 if (start < vma->vm_start) {
851 unmapped_error = -ENOMEM;
852 start = vma->vm_start;
853 if (start >= end)
854 goto out;
855 }
856
857
858 tmp = vma->vm_end;
859 if (end < tmp)
860 tmp = end;
861
862
863 error = madvise_vma(vma, &prev, start, tmp, behavior);
864 if (error)
865 goto out;
866 start = tmp;
867 if (prev && start < prev->vm_end)
868 start = prev->vm_end;
869 error = unmapped_error;
870 if (start >= end)
871 goto out;
872 if (prev)
873 vma = prev->vm_next;
874 else
875 vma = find_vma(current->mm, start);
876 }
877out:
878 blk_finish_plug(&plug);
879 if (write)
880 up_write(¤t->mm->mmap_sem);
881 else
882 up_read(¤t->mm->mmap_sem);
883
884 return error;
885}
886