/*
 * linux/mm/memory.c
 *
 * Core page-table and page-fault handling for the memory management
 * subsystem: building and tearing down user page tables, copying them
 * at fork(), zapping mapped ranges, inserting pages and raw PFNs into
 * VMAs, and servicing copy-on-write faults.
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES

unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory marks the upper bound of the kernel's direct mapping
 * (the end of ZONE_NORMAL); several users, ioremap() among them,
 * rely on it as the limit of directly mapped memory.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space layout (stacks, mmaps, brk, etc.).
 * With CONFIG_COMPAT_BRK the default leaves the heap (brk) alone so
 * that ancient binaries which assume a fixed heap start keep working.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
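
/*
 * Illustrative note, not from this file: the same policy is exposed at
 * run time through the sysctl kernel.randomize_va_space, so booting
 * with "norandmaps" is roughly equivalent to
 *
 *	echo 0 > /proc/sys/kernel/randomize_va_space
 *
 * where 0 disables randomization, 1 randomizes stack/mmap/VDSO layout,
 * and 2 additionally randomizes the heap (brk).
 */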

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init();
 * here we only need to record its PFN once, early during boot.
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync the per-task RSS cache back into the mm once per 64 events */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif

/*
 * Note: this doesn't free the actual pages themselves; that has been
 * handled earlier, when the memory regions were unmapped.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}
201
202static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
203 unsigned long addr, unsigned long end,
204 unsigned long floor, unsigned long ceiling)
205{
206 pmd_t *pmd;
207 unsigned long next;
208 unsigned long start;
209
210 start = addr;
211 pmd = pmd_offset(pud, addr);
212 do {
213 next = pmd_addr_end(addr, end);
214 if (pmd_none_or_clear_bad(pmd))
215 continue;
216 free_pte_range(tlb, pmd, addr);
217 } while (pmd++, addr = next, addr != end);
218
219 start &= PUD_MASK;
220 if (start < floor)
221 return;
222 if (ceiling) {
223 ceiling &= PUD_MASK;
224 if (!ceiling)
225 return;
226 }
227 if (end - 1 > ceiling - 1)
228 return;
229
230 pmd = pmd_offset(pud, start);
231 pud_clear(pud);
232 pmd_free_tlb(tlb, pmd, start);
233 mm_dec_nr_pmds(tlb->mm);
234}
235
236static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
237 unsigned long addr, unsigned long end,
238 unsigned long floor, unsigned long ceiling)
239{
240 pud_t *pud;
241 unsigned long next;
242 unsigned long start;
243
244 start = addr;
245 pud = pud_offset(p4d, addr);
246 do {
247 next = pud_addr_end(addr, end);
248 if (pud_none_or_clear_bad(pud))
249 continue;
250 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
251 } while (pud++, addr = next, addr != end);
252
253 start &= P4D_MASK;
254 if (start < floor)
255 return;
256 if (ceiling) {
257 ceiling &= P4D_MASK;
258 if (!ceiling)
259 return;
260 }
261 if (end - 1 > ceiling - 1)
262 return;
263
264 pud = pud_offset(p4d, start);
265 p4d_clear(p4d);
266 pud_free_tlb(tlb, pud, start);
267 mm_dec_nr_puds(tlb->mm);
268}
269
270static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
271 unsigned long addr, unsigned long end,
272 unsigned long floor, unsigned long ceiling)
273{
274 p4d_t *p4d;
275 unsigned long next;
276 unsigned long start;
277
278 start = addr;
279 p4d = p4d_offset(pgd, addr);
280 do {
281 next = p4d_addr_end(addr, end);
282 if (p4d_none_or_clear_bad(p4d))
283 continue;
284 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
285 } while (p4d++, addr = next, addr != end);
286
287 start &= PGDIR_MASK;
288 if (start < floor)
289 return;
290 if (ceiling) {
291 ceiling &= PGDIR_MASK;
292 if (!ceiling)
293 return;
294 }
295 if (end - 1 > ceiling - 1)
296 return;
297
298 p4d = p4d_offset(pgd, start);
299 pgd_clear(pgd);
300 p4d_free_tlb(tlb, p4d, start);
301}
302
/*
 * This function frees user-level page tables of a process.
 */
306void free_pgd_range(struct mmu_gather *tlb,
307 unsigned long addr, unsigned long end,
308 unsigned long floor, unsigned long ceiling)
309{
310 pgd_t *pgd;
311 unsigned long next;
312
	/*
	 * The floor/ceiling arguments bound how far the page-table
	 * freeing may reach beyond [addr, end): a page-table page is
	 * only torn down once no part of it can still be in use by a
	 * neighbouring mapping.  Each level below therefore rounds its
	 * start down to the covering entry and bails out if that would
	 * cross floor or ceiling.  A ceiling of 0 means "no ceiling",
	 * which is why the checks below treat 0 specially.
	 */
339 addr &= PMD_MASK;
340 if (addr < floor) {
341 addr += PMD_SIZE;
342 if (!addr)
343 return;
344 }
345 if (ceiling) {
346 ceiling &= PMD_MASK;
347 if (!ceiling)
348 return;
349 }
350 if (end - 1 > ceiling - 1)
351 end -= PMD_SIZE;
352 if (addr > end - 1)
353 return;
354
355
356
357
358 tlb_change_page_size(tlb, PAGE_SIZE);
359 pgd = pgd_offset(tlb->mm, addr);
360 do {
361 next = pgd_addr_end(addr, end);
362 if (pgd_none_or_clear_bad(pgd))
363 continue;
364 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
365 } while (pgd++, addr = next, addr != end);
366}
367
368void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
369 unsigned long floor, unsigned long ceiling)
370{
371 while (vma) {
372 struct vm_area_struct *next = vma->vm_next;
373 unsigned long addr = vma->vm_start;
374
375
376
377
378
379 unlink_anon_vmas(vma);
380 unlink_file_vma(vma);
381
382 if (is_vm_hugetlb_page(vma)) {
383 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
384 floor, next ? next->vm_start : ceiling);
385 } else {
386
387
388
389 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
390 && !is_vm_hugetlb_page(next)) {
391 vma = next;
392 next = vma->vm_next;
393 unlink_anon_vmas(vma);
394 unlink_file_vma(vma);
395 }
396 free_pgd_range(tlb, addr, vma->vm_end,
397 floor, next ? next->vm_start : ceiling);
398 }
399 vma = next;
400 }
401}
402
403int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
404{
405 spinlock_t *ptl;
406 pgtable_t new = pte_alloc_one(mm);
407 if (!new)
408 return -ENOMEM;

	/*
	 * Ensure all pte setup (e.g. pte page lock and page clearing) is
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.  Lockless page-table walkers chase the
	 * pmd -> pte pointer with nothing but data dependencies ordering
	 * their loads, so the store side must supply the write barrier.
	 */
	smp_wmb();

425 ptl = pmd_lock(mm, pmd);
426 if (likely(pmd_none(*pmd))) {
427 mm_inc_nr_ptes(mm);
428 pmd_populate(mm, pmd, new);
429 new = NULL;
430 }
431 spin_unlock(ptl);
432 if (new)
433 pte_free(mm, new);
434 return 0;
435}
436
437int __pte_alloc_kernel(pmd_t *pmd)
438{
439 pte_t *new = pte_alloc_one_kernel(&init_mm);
440 if (!new)
441 return -ENOMEM;
442
443 smp_wmb();
444
445 spin_lock(&init_mm.page_table_lock);
446 if (likely(pmd_none(*pmd))) {
447 pmd_populate_kernel(&init_mm, pmd, new);
448 new = NULL;
449 }
450 spin_unlock(&init_mm.page_table_lock);
451 if (new)
452 pte_free_kernel(&init_mm, new);
453 return 0;
454}
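
/*
 * Both __pte_alloc() and __pte_alloc_kernel() above follow the same
 * optimistic pattern: allocate the pte page with no lock held, then
 * re-check pmd_none() under the page-table lock and only publish the
 * new table if nobody raced us; a losing allocation is simply freed.
 * Sketch of the shape (names as above, not a new API):
 *
 *	new = allocate();                  (no locks held)
 *	lock();
 *	if (pmd still empty)
 *		publish(new), new = NULL;
 *	unlock();
 *	if (new)
 *		free(new);                 (we lost the race)
 */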
455
456static inline void init_rss_vec(int *rss)
457{
458 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
459}
460
461static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
462{
463 int i;
464
465 if (current->mm == mm)
466 sync_mm_rss(mm);
467 for (i = 0; i < NR_MM_COUNTERS; i++)
468 if (rss[i])
469 add_mm_counter(mm, i, rss[i]);
470}
471
472
473
474
475
476
477
478
479static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
480 pte_t pte, struct page *page)
481{
482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
483 p4d_t *p4d = p4d_offset(pgd, addr);
484 pud_t *pud = pud_offset(p4d, addr);
485 pmd_t *pmd = pmd_offset(pud, addr);
486 struct address_space *mapping;
487 pgoff_t index;
488 static unsigned long resume;
489 static unsigned long nr_shown;
490 static unsigned long nr_unshown;
491
492
493
494
495
496 if (nr_shown == 60) {
497 if (time_before(jiffies, resume)) {
498 nr_unshown++;
499 return;
500 }
501 if (nr_unshown) {
502 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
503 nr_unshown);
504 nr_unshown = 0;
505 }
506 nr_shown = 0;
507 }
508 if (nr_shown++ == 0)
509 resume = jiffies + 60 * HZ;
510
511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
512 index = linear_page_index(vma, addr);
513
514 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
515 current->comm,
516 (long long)pte_val(pte), (long long)pmd_val(*pmd));
517 if (page)
518 dump_page(page, "bad pte");
519 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
520 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
521 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
522 vma->vm_file,
523 vma->vm_ops ? vma->vm_ops->fault : NULL,
524 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
525 mapping ? mapping->a_ops->readpage : NULL);
526 dump_stack();
527 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
528}
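
/*
 * A "Bad page map" report from print_bad_pte() above lands in the
 * kernel log and, via add_taint(TAINT_BAD_PAGE, ...), sets the 'B'
 * taint flag, so later oopses can be recognised as possibly caused by
 * earlier page-table corruption.
 */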
529
/*
 * vm_normal_page -- this function gets the "struct page" associated
 * with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch
 * it).  In this case we can return NULL and the caller knows not to
 * refcount the page or otherwise treat it as normal memory.
 *
 * With CONFIG_ARCH_HAS_PTE_SPECIAL the architecture marks such ptes
 * with pte_special(); without it we fall back to identifying special
 * mappings by vma type: VM_MIXEDMAP mappings may contain a mix of
 * normal pages and raw PFNs (distinguished by pfn_valid()), while pure
 * VM_PFNMAP mappings never have a struct page -- except for COW'ed
 * anonymous pages inside a COW-able VM_PFNMAP mapping, which are
 * recognised via the vm_pgoff linearity check below.
 *
 * The zero page and devmap pages are also treated as special.
 */
572struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
573 pte_t pte)
574{
575 unsigned long pfn = pte_pfn(pte);
576
577 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
578 if (likely(!pte_special(pte)))
579 goto check_pfn;
580 if (vma->vm_ops && vma->vm_ops->find_special_page)
581 return vma->vm_ops->find_special_page(vma, addr);
582 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
583 return NULL;
584 if (is_zero_pfn(pfn))
585 return NULL;
586 if (pte_devmap(pte))
587 return NULL;
588
589 print_bad_pte(vma, addr, pte, NULL);
590 return NULL;
591 }
592
593
594
595 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
596 if (vma->vm_flags & VM_MIXEDMAP) {
597 if (!pfn_valid(pfn))
598 return NULL;
599 goto out;
600 } else {
601 unsigned long off;
602 off = (addr - vma->vm_start) >> PAGE_SHIFT;
603 if (pfn == vma->vm_pgoff + off)
604 return NULL;
605 if (!is_cow_mapping(vma->vm_flags))
606 return NULL;
607 }
608 }
609
610 if (is_zero_pfn(pfn))
611 return NULL;
612
613check_pfn:
614 if (unlikely(pfn > highest_memmap_pfn)) {
615 print_bad_pte(vma, addr, pte, NULL);
616 return NULL;
617 }
618
619
620
621
622
623out:
624 return pfn_to_page(pfn);
625}
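
/*
 * Minimal usage sketch (mirrors the callers later in this file, e.g.
 * zap_pte_range() and copy_present_pte()):
 *
 *	struct page *page = vm_normal_page(vma, addr, pte);
 *	if (page) {
 *		// ordinary page: safe to get_page(), use rmap, counters
 *	} else {
 *		// special mapping (raw PFN, zero page, ...): leave alone
 *	}
 */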
626
627#ifdef CONFIG_TRANSPARENT_HUGEPAGE
628struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
629 pmd_t pmd)
630{
631 unsigned long pfn = pmd_pfn(pmd);
632
633
634
635
636
637
638 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
639 if (vma->vm_flags & VM_MIXEDMAP) {
640 if (!pfn_valid(pfn))
641 return NULL;
642 goto out;
643 } else {
644 unsigned long off;
645 off = (addr - vma->vm_start) >> PAGE_SHIFT;
646 if (pfn == vma->vm_pgoff + off)
647 return NULL;
648 if (!is_cow_mapping(vma->vm_flags))
649 return NULL;
650 }
651 }
652
653 if (pmd_devmap(pmd))
654 return NULL;
655 if (is_zero_pfn(pfn))
656 return NULL;
657 if (unlikely(pfn > highest_memmap_pfn))
658 return NULL;
659
660
661
662
663
664out:
665 return pfn_to_page(pfn);
666}
667#endif
668
669
670
671
672
673
674
675static unsigned long
676copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
677 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
678 unsigned long addr, int *rss)
679{
680 unsigned long vm_flags = vma->vm_flags;
681 pte_t pte = *src_pte;
682 struct page *page;
683 swp_entry_t entry = pte_to_swp_entry(pte);
684
685 if (likely(!non_swap_entry(entry))) {
686 if (swap_duplicate(entry) < 0)
687 return entry.val;
688
689
690 if (unlikely(list_empty(&dst_mm->mmlist))) {
691 spin_lock(&mmlist_lock);
692 if (list_empty(&dst_mm->mmlist))
693 list_add(&dst_mm->mmlist,
694 &src_mm->mmlist);
695 spin_unlock(&mmlist_lock);
696 }
697 rss[MM_SWAPENTS]++;
698 } else if (is_migration_entry(entry)) {
699 page = migration_entry_to_page(entry);
700
701 rss[mm_counter(page)]++;
702
703 if (is_write_migration_entry(entry) &&
704 is_cow_mapping(vm_flags)) {
705
706
707
708
709 make_migration_entry_read(&entry);
710 pte = swp_entry_to_pte(entry);
711 if (pte_swp_soft_dirty(*src_pte))
712 pte = pte_swp_mksoft_dirty(pte);
713 if (pte_swp_uffd_wp(*src_pte))
714 pte = pte_swp_mkuffd_wp(pte);
715 set_pte_at(src_mm, addr, src_pte, pte);
716 }
717 } else if (is_device_private_entry(entry)) {
718 page = device_private_entry_to_page(entry);
719
720
721
722
723
724
725
726
727
728
729 get_page(page);
730 rss[mm_counter(page)]++;
731 page_dup_rmap(page, false);
732
733
734
735
736
737
738
739
740 if (is_write_device_private_entry(entry) &&
741 is_cow_mapping(vm_flags)) {
742 make_device_private_entry_read(&entry);
743 pte = swp_entry_to_pte(entry);
744 if (pte_swp_uffd_wp(*src_pte))
745 pte = pte_swp_mkuffd_wp(pte);
746 set_pte_at(src_mm, addr, src_pte, pte);
747 }
748 }
749 set_pte_at(dst_mm, addr, dst_pte, pte);
750 return 0;
751}
752
/*
 * Copy a present and normal page at fork time.
 *
 * The usual case is that no copy is needed at all: the caller can keep
 * sharing the page, and we return a positive value.  But if the page
 * may be pinned for DMA in the parent (see page_needs_cow_for_dma()),
 * the child must get its own copy now rather than a COW reference: we
 * copy into the preallocated page and return 0.  If a preallocated
 * page is needed but not available yet, return -EAGAIN so the caller
 * can drop the locks, allocate one, and retry.
 */
773static inline int
774copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
775 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
776 struct page **prealloc, pte_t pte, struct page *page)
777{
778 struct page *new_page;
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793 if (likely(!page_needs_cow_for_dma(src_vma, page)))
794 return 1;
795
796 new_page = *prealloc;
797 if (!new_page)
798 return -EAGAIN;
799
800
801
802
803
804 *prealloc = NULL;
805 copy_user_highpage(new_page, page, addr, src_vma);
806 __SetPageUptodate(new_page);
807 page_add_new_anon_rmap(new_page, dst_vma, addr, false);
808 lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
809 rss[mm_counter(new_page)]++;
810
811
812 pte = mk_pte(new_page, dst_vma->vm_page_prot);
813 pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
814 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
815 return 0;
816}
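
/*
 * Design note: the eager copy above exists because a page that may be
 * pinned for DMA in the parent (page_needs_cow_for_dma()) cannot be
 * write-protected and shared COW-style -- the parent's DMA would then
 * scribble on a page the child also sees.  Copying at fork() keeps the
 * pinned page exclusively with the parent.
 */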
817
818
819
820
821
822static inline int
823copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
824 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
825 struct page **prealloc)
826{
827 struct mm_struct *src_mm = src_vma->vm_mm;
828 unsigned long vm_flags = src_vma->vm_flags;
829 pte_t pte = *src_pte;
830 struct page *page;
831
832 page = vm_normal_page(src_vma, addr, pte);
833 if (page) {
834 int retval;
835
836 retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
837 addr, rss, prealloc, pte, page);
838 if (retval <= 0)
839 return retval;
840
841 get_page(page);
842 page_dup_rmap(page, false);
843 rss[mm_counter(page)]++;
844 }
845
846
847
848
849
850 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
851 ptep_set_wrprotect(src_mm, addr, src_pte);
852 pte = pte_wrprotect(pte);
853 }
854
855
856
857
858
859 if (vm_flags & VM_SHARED)
860 pte = pte_mkclean(pte);
861 pte = pte_mkold(pte);
862
863
864
865
866
867
868 if (!(vm_flags & VM_UFFD_WP))
869 pte = pte_clear_uffd_wp(pte);
870
871 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
872 return 0;
873}
874
875static inline struct page *
876page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
877 unsigned long addr)
878{
879 struct page *new_page;
880
881 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
882 if (!new_page)
883 return NULL;
884
885 if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
886 put_page(new_page);
887 return NULL;
888 }
889 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
890
891 return new_page;
892}
893
894static int
895copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
896 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
897 unsigned long end)
898{
899 struct mm_struct *dst_mm = dst_vma->vm_mm;
900 struct mm_struct *src_mm = src_vma->vm_mm;
901 pte_t *orig_src_pte, *orig_dst_pte;
902 pte_t *src_pte, *dst_pte;
903 spinlock_t *src_ptl, *dst_ptl;
904 int progress, ret = 0;
905 int rss[NR_MM_COUNTERS];
906 swp_entry_t entry = (swp_entry_t){0};
907 struct page *prealloc = NULL;
908
909again:
910 progress = 0;
911 init_rss_vec(rss);
912
913 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
914 if (!dst_pte) {
915 ret = -ENOMEM;
916 goto out;
917 }
918 src_pte = pte_offset_map(src_pmd, addr);
919 src_ptl = pte_lockptr(src_mm, src_pmd);
920 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
921 orig_src_pte = src_pte;
922 orig_dst_pte = dst_pte;
923 arch_enter_lazy_mmu_mode();
924
925 do {
926
927
928
929
930 if (progress >= 32) {
931 progress = 0;
932 if (need_resched() ||
933 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
934 break;
935 }
936 if (pte_none(*src_pte)) {
937 progress++;
938 continue;
939 }
940 if (unlikely(!pte_present(*src_pte))) {
941 entry.val = copy_nonpresent_pte(dst_mm, src_mm,
942 dst_pte, src_pte,
943 src_vma, addr, rss);
944 if (entry.val)
945 break;
946 progress += 8;
947 continue;
948 }
949
950 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
951 addr, rss, &prealloc);
952
953
954
955
956 if (unlikely(ret == -EAGAIN))
957 break;
958 if (unlikely(prealloc)) {
959
960
961
962
963
964
965 put_page(prealloc);
966 prealloc = NULL;
967 }
968 progress += 8;
969 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
970
971 arch_leave_lazy_mmu_mode();
972 spin_unlock(src_ptl);
973 pte_unmap(orig_src_pte);
974 add_mm_rss_vec(dst_mm, rss);
975 pte_unmap_unlock(orig_dst_pte, dst_ptl);
976 cond_resched();
977
978 if (entry.val) {
979 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
980 ret = -ENOMEM;
981 goto out;
982 }
983 entry.val = 0;
984 } else if (ret) {
985 WARN_ON_ONCE(ret != -EAGAIN);
986 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
987 if (!prealloc)
988 return -ENOMEM;
989
990 ret = 0;
991 }
992 if (addr != end)
993 goto again;
994out:
995 if (unlikely(prealloc))
996 put_page(prealloc);
997 return ret;
998}
999
1000static inline int
1001copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1002 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1003 unsigned long end)
1004{
1005 struct mm_struct *dst_mm = dst_vma->vm_mm;
1006 struct mm_struct *src_mm = src_vma->vm_mm;
1007 pmd_t *src_pmd, *dst_pmd;
1008 unsigned long next;
1009
1010 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1011 if (!dst_pmd)
1012 return -ENOMEM;
1013 src_pmd = pmd_offset(src_pud, addr);
1014 do {
1015 next = pmd_addr_end(addr, end);
1016 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1017 || pmd_devmap(*src_pmd)) {
1018 int err;
1019 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1020 err = copy_huge_pmd(dst_mm, src_mm,
1021 dst_pmd, src_pmd, addr, src_vma);
1022 if (err == -ENOMEM)
1023 return -ENOMEM;
1024 if (!err)
1025 continue;
1026
1027 }
1028 if (pmd_none_or_clear_bad(src_pmd))
1029 continue;
1030 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1031 addr, next))
1032 return -ENOMEM;
1033 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1034 return 0;
1035}
1036
1037static inline int
1038copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1039 p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1040 unsigned long end)
1041{
1042 struct mm_struct *dst_mm = dst_vma->vm_mm;
1043 struct mm_struct *src_mm = src_vma->vm_mm;
1044 pud_t *src_pud, *dst_pud;
1045 unsigned long next;
1046
1047 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1048 if (!dst_pud)
1049 return -ENOMEM;
1050 src_pud = pud_offset(src_p4d, addr);
1051 do {
1052 next = pud_addr_end(addr, end);
1053 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1054 int err;
1055
1056 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1057 err = copy_huge_pud(dst_mm, src_mm,
1058 dst_pud, src_pud, addr, src_vma);
1059 if (err == -ENOMEM)
1060 return -ENOMEM;
1061 if (!err)
1062 continue;
1063
1064 }
1065 if (pud_none_or_clear_bad(src_pud))
1066 continue;
1067 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1068 addr, next))
1069 return -ENOMEM;
1070 } while (dst_pud++, src_pud++, addr = next, addr != end);
1071 return 0;
1072}
1073
1074static inline int
1075copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1076 pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1077 unsigned long end)
1078{
1079 struct mm_struct __maybe_unused *dst_mm = dst_vma->vm_mm;
1080 p4d_t *src_p4d, *dst_p4d;
1081 unsigned long next;
1082
1083 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1084 if (!dst_p4d)
1085 return -ENOMEM;
1086 src_p4d = p4d_offset(src_pgd, addr);
1087 do {
1088 next = p4d_addr_end(addr, end);
1089 if (p4d_none_or_clear_bad(src_p4d))
1090 continue;
1091 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1092 addr, next))
1093 return -ENOMEM;
1094 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1095 return 0;
1096}
1097
1098int
1099copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1100{
1101 pgd_t *src_pgd, *dst_pgd;
1102 unsigned long next;
1103 unsigned long addr = src_vma->vm_start;
1104 unsigned long end = src_vma->vm_end;
1105 struct mm_struct *dst_mm = dst_vma->vm_mm;
1106 struct mm_struct *src_mm = src_vma->vm_mm;
1107 struct mmu_notifier_range range;
1108 bool is_cow;
1109 int ret;
1110
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings.  The tradeoff is that copy_page_range is
	 * more efficient than faulting.
	 */
1117 if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1118 !src_vma->anon_vma)
1119 return 0;
1120
1121 if (is_vm_hugetlb_page(src_vma))
1122 return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1123
1124 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1125
1126
1127
1128
1129 ret = track_pfn_copy(src_vma);
1130 if (ret)
1131 return ret;
1132 }
1133
1134
1135
1136
1137
1138
1139
1140 is_cow = is_cow_mapping(src_vma->vm_flags);
1141
1142 if (is_cow) {
1143 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1144 0, src_vma, src_mm, addr, end);
1145 mmu_notifier_invalidate_range_start(&range);
1146
1147
1148
1149
1150
1151
1152
1153 mmap_assert_write_locked(src_mm);
1154 raw_write_seqcount_begin(&src_mm->write_protect_seq);
1155 }
1156
1157 ret = 0;
1158 dst_pgd = pgd_offset(dst_mm, addr);
1159 src_pgd = pgd_offset(src_mm, addr);
1160 do {
1161 next = pgd_addr_end(addr, end);
1162 if (pgd_none_or_clear_bad(src_pgd))
1163 continue;
1164 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1165 addr, next))) {
1166 ret = -ENOMEM;
1167 break;
1168 }
1169 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1170
1171 if (is_cow) {
1172 raw_write_seqcount_end(&src_mm->write_protect_seq);
1173 mmu_notifier_invalidate_range_end(&range);
1174 }
1175 return ret;
1176}
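
/*
 * copy_page_range() is the fork() half of this file: dup_mmap() calls
 * it for every VMA of the parent, after which most private pages are
 * shared read-only and copied lazily by do_wp_page() further below.
 */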
1177
1178static unsigned long zap_pte_range(struct mmu_gather *tlb,
1179 struct vm_area_struct *vma, pmd_t *pmd,
1180 unsigned long addr, unsigned long end,
1181 struct zap_details *details)
1182{
1183 struct mm_struct *mm = tlb->mm;
1184 int force_flush = 0;
1185 int rss[NR_MM_COUNTERS];
1186 spinlock_t *ptl;
1187 pte_t *start_pte;
1188 pte_t *pte;
1189 swp_entry_t entry;
1190
1191 tlb_change_page_size(tlb, PAGE_SIZE);
1192again:
1193 init_rss_vec(rss);
1194 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1195 pte = start_pte;
1196 flush_tlb_batched_pending(mm);
1197 arch_enter_lazy_mmu_mode();
1198 do {
1199 pte_t ptent = *pte;
1200 if (pte_none(ptent))
1201 continue;
1202
1203 if (pte_present(ptent)) {
1204 struct page *page;
1205
1206 page = vm_normal_page(vma, addr, ptent);
1207 if (unlikely(details) && page) {
1208
1209
1210
1211
1212
1213 if (details->check_mapping &&
1214 details->check_mapping != page_rmapping(page))
1215 continue;
1216 }
1217 ptent = ptep_get_and_clear_full(mm, addr, pte,
1218 tlb->fullmm);
1219 tlb_remove_tlb_entry(tlb, pte, addr);
1220 if (unlikely(!page))
1221 continue;
1222
1223 if (!PageAnon(page)) {
1224 if (pte_dirty(ptent)) {
1225 force_flush = 1;
1226 set_page_dirty(page);
1227 }
1228 if (pte_young(ptent) &&
1229 likely(!(vma->vm_flags & VM_SEQ_READ)))
1230 mark_page_accessed(page);
1231 }
1232 rss[mm_counter(page)]--;
1233 page_remove_rmap(page, false);
1234 if (unlikely(page_mapcount(page) < 0))
1235 print_bad_pte(vma, addr, ptent, page);
1236 if (unlikely(__tlb_remove_page(tlb, page))) {
1237 force_flush = 1;
1238 addr += PAGE_SIZE;
1239 break;
1240 }
1241 continue;
1242 }
1243
1244 entry = pte_to_swp_entry(ptent);
1245 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1246 struct page *page = device_private_entry_to_page(entry);
1247
1248 if (unlikely(details && details->check_mapping)) {
1249
1250
1251
1252
1253
1254 if (details->check_mapping !=
1255 page_rmapping(page))
1256 continue;
1257 }
1258
1259 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1260 rss[mm_counter(page)]--;
1261 page_remove_rmap(page, false);
1262 put_page(page);
1263 continue;
1264 }
1265
1266
1267 if (unlikely(details))
1268 continue;
1269
1270 entry = pte_to_swp_entry(ptent);
1271 if (!non_swap_entry(entry))
1272 rss[MM_SWAPENTS]--;
1273 else if (is_migration_entry(entry)) {
1274 struct page *page;
1275
1276 page = migration_entry_to_page(entry);
1277 rss[mm_counter(page)]--;
1278 }
1279 if (unlikely(!free_swap_and_cache(entry)))
1280 print_bad_pte(vma, addr, ptent, NULL);
1281 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1282 } while (pte++, addr += PAGE_SIZE, addr != end);
1283
1284 add_mm_rss_vec(mm, rss);
1285 arch_leave_lazy_mmu_mode();
1286
1287
1288 if (force_flush)
1289 tlb_flush_mmu_tlbonly(tlb);
1290 pte_unmap_unlock(start_pte, ptl);
1291
1292
1293
1294
1295
1296
1297
1298 if (force_flush) {
1299 force_flush = 0;
1300 tlb_flush_mmu(tlb);
1301 if (addr != end)
1302 goto again;
1303 }
1304
1305 return addr;
1306}
1307
1308static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1309 struct vm_area_struct *vma, pud_t *pud,
1310 unsigned long addr, unsigned long end,
1311 struct zap_details *details)
1312{
1313 pmd_t *pmd;
1314 unsigned long next;
1315
1316 pmd = pmd_offset(pud, addr);
1317 do {
1318 next = pmd_addr_end(addr, end);
1319 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1320 if (next - addr != HPAGE_PMD_SIZE)
1321 __split_huge_pmd(vma, pmd, addr, false, NULL);
1322 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1323 goto next;
1324
1325 }
1326
1327
1328
1329
1330
1331
1332
1333 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1334 goto next;
1335 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1336next:
1337 cond_resched();
1338 } while (pmd++, addr = next, addr != end);
1339
1340 return addr;
1341}
1342
1343static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1344 struct vm_area_struct *vma, p4d_t *p4d,
1345 unsigned long addr, unsigned long end,
1346 struct zap_details *details)
1347{
1348 pud_t *pud;
1349 unsigned long next;
1350
1351 pud = pud_offset(p4d, addr);
1352 do {
1353 next = pud_addr_end(addr, end);
1354 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1355 if (next - addr != HPAGE_PUD_SIZE) {
1356 mmap_assert_locked(tlb->mm);
1357 split_huge_pud(vma, pud, addr);
1358 } else if (zap_huge_pud(tlb, vma, pud, addr))
1359 goto next;
1360
1361 }
1362 if (pud_none_or_clear_bad(pud))
1363 continue;
1364 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1365next:
1366 cond_resched();
1367 } while (pud++, addr = next, addr != end);
1368
1369 return addr;
1370}
1371
1372static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1373 struct vm_area_struct *vma, pgd_t *pgd,
1374 unsigned long addr, unsigned long end,
1375 struct zap_details *details)
1376{
1377 p4d_t *p4d;
1378 unsigned long next;
1379
1380 p4d = p4d_offset(pgd, addr);
1381 do {
1382 next = p4d_addr_end(addr, end);
1383 if (p4d_none_or_clear_bad(p4d))
1384 continue;
1385 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1386 } while (p4d++, addr = next, addr != end);
1387
1388 return addr;
1389}
1390
1391void unmap_page_range(struct mmu_gather *tlb,
1392 struct vm_area_struct *vma,
1393 unsigned long addr, unsigned long end,
1394 struct zap_details *details)
1395{
1396 pgd_t *pgd;
1397 unsigned long next;
1398
1399 BUG_ON(addr >= end);
1400 tlb_start_vma(tlb, vma);
1401 pgd = pgd_offset(vma->vm_mm, addr);
1402 do {
1403 next = pgd_addr_end(addr, end);
1404 if (pgd_none_or_clear_bad(pgd))
1405 continue;
1406 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1407 } while (pgd++, addr = next, addr != end);
1408 tlb_end_vma(tlb, vma);
1409}
1410
1411
1412static void unmap_single_vma(struct mmu_gather *tlb,
1413 struct vm_area_struct *vma, unsigned long start_addr,
1414 unsigned long end_addr,
1415 struct zap_details *details)
1416{
1417 unsigned long start = max(vma->vm_start, start_addr);
1418 unsigned long end;
1419
1420 if (start >= vma->vm_end)
1421 return;
1422 end = min(vma->vm_end, end_addr);
1423 if (end <= vma->vm_start)
1424 return;
1425
1426 if (vma->vm_file)
1427 uprobe_munmap(vma, start, end);
1428
1429 if (unlikely(vma->vm_flags & VM_PFNMAP))
1430 untrack_pfn(vma, 0, 0);
1431
1432 if (start != end) {
1433 if (unlikely(is_vm_hugetlb_page(vma))) {
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445 if (vma->vm_file) {
1446 i_mmap_lock_write(vma->vm_file->f_mapping);
1447 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1448 i_mmap_unlock_write(vma->vm_file->f_mapping);
1449 }
1450 } else
1451 unmap_page_range(tlb, vma, start, end, details);
1452 }
1453}
1454
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.  Only addresses between @start_addr
 * and @end_addr will be unmapped, and the VMA list must be sorted in
 * ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped
 * address range after unmap_vmas() returns.  So the only responsibility
 * here is to ensure that any thus-far unmapped pages are flushed before
 * unmap_vmas() itself has to touch any of them.
 */
1473void unmap_vmas(struct mmu_gather *tlb,
1474 struct vm_area_struct *vma, unsigned long start_addr,
1475 unsigned long end_addr)
1476{
1477 struct mmu_notifier_range range;
1478
1479 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1480 start_addr, end_addr);
1481 mmu_notifier_invalidate_range_start(&range);
1482 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1483 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1484 mmu_notifier_invalidate_range_end(&range);
1485}
1486
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list.
 */
1495void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1496 unsigned long size)
1497{
1498 struct mmu_notifier_range range;
1499 struct mmu_gather tlb;
1500
1501 lru_add_drain();
1502 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1503 start, start + size);
1504 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1505 update_hiwater_rss(vma->vm_mm);
1506 mmu_notifier_invalidate_range_start(&range);
1507 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1508 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1509 mmu_notifier_invalidate_range_end(&range);
1510 tlb_finish_mmu(&tlb, start, range.end);
1511}
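
/*
 * Typical caller (outside this file): madvise(MADV_DONTNEED) drops the
 * PTEs of a range via zap_page_range(), so the next touch of the range
 * faults in fresh (zero or file-backed) pages.
 */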
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1523 unsigned long size, struct zap_details *details)
1524{
1525 struct mmu_notifier_range range;
1526 struct mmu_gather tlb;
1527
1528 lru_add_drain();
1529 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1530 address, address + size);
1531 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1532 update_hiwater_rss(vma->vm_mm);
1533 mmu_notifier_invalidate_range_start(&range);
1534 unmap_single_vma(&tlb, vma, address, range.end, details);
1535 mmu_notifier_invalidate_range_end(&range);
1536 tlb_finish_mmu(&tlb, address, range.end);
1537}
1538
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
1550void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1551 unsigned long size)
1552{
1553 if (address < vma->vm_start || address + size > vma->vm_end ||
1554 !(vma->vm_flags & VM_PFNMAP))
1555 return;
1556
1557 zap_page_range_single(vma, address, size, NULL);
1558}
1559EXPORT_SYMBOL_GPL(zap_vma_ptes);
1560
1561pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1562 spinlock_t **ptl)
1563{
1564 pgd_t *pgd;
1565 p4d_t *p4d;
1566 pud_t *pud;
1567 pmd_t *pmd;
1568
1569 pgd = pgd_offset(mm, addr);
1570 p4d = p4d_alloc(mm, pgd, addr);
1571 if (!p4d)
1572 return NULL;
1573 pud = pud_alloc(mm, p4d, addr);
1574 if (!pud)
1575 return NULL;
1576 pmd = pmd_alloc(mm, pud, addr);
1577 if (!pmd)
1578 return NULL;
1579
1580 VM_BUG_ON(pmd_trans_huge(*pmd));
1581 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1582}
1583
1584
1585
1586
1587
1588
1589
1590
1591static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1592 struct page *page, pgprot_t prot)
1593{
1594 struct mm_struct *mm = vma->vm_mm;
1595 int retval;
1596 pte_t *pte;
1597 spinlock_t *ptl;
1598
1599 retval = -EINVAL;
1600 if (PageAnon(page))
1601 goto out;
1602 retval = -ENOMEM;
1603 flush_dcache_page(page);
1604 pte = get_locked_pte(mm, addr, &ptl);
1605 if (!pte)
1606 goto out;
1607 retval = -EBUSY;
1608 if (!pte_none(*pte))
1609 goto out_unlock;
1610
1611
1612 get_page(page);
1613 inc_mm_counter_fast(mm, mm_counter_file(page));
1614 page_add_file_rmap(page, false);
1615 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1616
1617 retval = 0;
1618 pte_unmap_unlock(pte, ptl);
1619 return retval;
1620out_unlock:
1621 pte_unmap_unlock(pte, ptl);
1622out:
1623 return retval;
1624}
1625
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.  The page must be an individual kernel allocation
 * with a non-zero refcount; anonymous pages are rejected.
 *
 * Usually called from an f_op->mmap handler under the mmap lock while
 * setting up the mapping; the first call marks the vma VM_MIXEDMAP so
 * that vm_normal_page() keeps treating the inserted pages as normal
 * memory.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1655int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1656 struct page *page)
1657{
1658 if (addr < vma->vm_start || addr >= vma->vm_end)
1659 return -EFAULT;
1660 if (!page_count(page))
1661 return -EINVAL;
1662 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1663 BUG_ON(mmap_read_trylock(vma->vm_mm));
1664 BUG_ON(vma->vm_flags & VM_PFNMAP);
1665 vma->vm_flags |= VM_MIXEDMAP;
1666 }
1667 return insert_page(vma, addr, page, vma->vm_page_prot);
1668}
1669EXPORT_SYMBOL(vm_insert_page);
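
/*
 * Illustrative driver-side sketch (not from this file; my_pages[] is a
 * made-up driver array): an mmap handler typically inserts its
 * kernel-allocated pages one by one, which is exactly the loop that
 * __vm_map_pages() below wraps up:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long i, npages = vma_pages(vma);
 *
 *		for (i = 0; i < npages; i++) {
 *			int err = vm_insert_page(vma,
 *					vma->vm_start + i * PAGE_SIZE,
 *					my_pages[i]);
 *			if (err)
 *				return err;
 *		}
 *		return 0;
 *	}
 */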
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1683 unsigned long num, unsigned long offset)
1684{
1685 unsigned long count = vma_pages(vma);
1686 unsigned long uaddr = vma->vm_start;
1687 int ret, i;
1688
1689
1690 if (offset > num)
1691 return -ENXIO;
1692
1693
1694 if (count > num - offset)
1695 return -ENXIO;
1696
1697 for (i = 0; i < count; i++) {
1698 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1699 if (ret < 0)
1700 return ret;
1701 uaddr += PAGE_SIZE;
1702 }
1703
1704 return 0;
1705}
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1726 unsigned long num)
1727{
1728 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1729}
1730EXPORT_SYMBOL(vm_map_pages);
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
1746 unsigned long num)
1747{
1748 return __vm_map_pages(vma, pages, num, 0);
1749}
1750EXPORT_SYMBOL(vm_map_pages_zero);
1751
1752static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1753 pfn_t pfn, pgprot_t prot, bool mkwrite)
1754{
1755 struct mm_struct *mm = vma->vm_mm;
1756 int retval;
1757 pte_t *pte, entry;
1758 spinlock_t *ptl;
1759
1760 retval = -ENOMEM;
1761 pte = get_locked_pte(mm, addr, &ptl);
1762 if (!pte)
1763 goto out;
1764 retval = -EBUSY;
1765 if (!pte_none(*pte)) {
1766 if (mkwrite) {
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1778 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1779 goto out_unlock;
1780 }
1781 entry = pte_mkyoung(*pte);
1782 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1783 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1784 update_mmu_cache(vma, addr, pte);
1785 }
1786 goto out_unlock;
1787 }
1788
1789
1790 if (pfn_t_devmap(pfn))
1791 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1792 else
1793 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1794
1795 if (mkwrite) {
1796 entry = pte_mkyoung(entry);
1797 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1798 }
1799
1800 set_pte_at(mm, addr, pte, entry);
1801 update_mmu_cache(vma, addr, pte);
1802
1803 retval = 0;
1804out_unlock:
1805 pte_unmap_unlock(pte, ptl);
1806out:
1807 return retval;
1808}
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1832 unsigned long pfn, pgprot_t pgprot)
1833{
1834 int err;
1835
1836
1837
1838
1839
1840
1841
1842 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1843 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1844 (VM_PFNMAP|VM_MIXEDMAP));
1845 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1846 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1847
1848 if (addr < vma->vm_start || addr >= vma->vm_end)
1849 return VM_FAULT_SIGBUS;
1850
1851 if (!pfn_modify_allowed(pfn, pgprot))
1852 return VM_FAULT_SIGBUS;
1853
1854 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1855
1856 err = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1857 false);
1858
1859 if (err == -ENOMEM)
1860 return VM_FAULT_OOM;
1861 if (err < 0 && err != -EBUSY)
1862 return VM_FAULT_SIGBUS;
1863
1864 return VM_FAULT_NOPAGE;
1865}
1866EXPORT_SYMBOL(vmf_insert_pfn_prot);
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1889 unsigned long pfn)
1890{
1891 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1892}
1893EXPORT_SYMBOL(vmf_insert_pfn);
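
/*
 * Because vmf_insert_pfn() already returns a vm_fault_t, a ->fault or
 * ->huge_fault handler can hand its result straight back, e.g. (pfn
 * computed by the driver, shown here only as an illustration):
 *
 *	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
 */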
1894
1895static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1896{
1897
1898 if (vma->vm_flags & VM_MIXEDMAP)
1899 return true;
1900 if (pfn_t_devmap(pfn))
1901 return true;
1902 if (pfn_t_special(pfn))
1903 return true;
1904 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
1905 return true;
1906 return false;
1907}
1908
1909static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
1910 unsigned long addr, pfn_t pfn, pgprot_t pgprot,
1911 bool mkwrite)
1912{
1913 int err;
1914
1915 BUG_ON(!vm_mixed_ok(vma, pfn));
1916
1917 if (addr < vma->vm_start || addr >= vma->vm_end)
1918 return VM_FAULT_SIGBUS;
1919
1920 track_pfn_insert(vma, &pgprot, pfn);
1921
1922 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1923 return VM_FAULT_SIGBUS;
1924
1925
1926
1927
1928
1929
1930
1931
1932 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
1933 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1934 struct page *page;
1935
1936
1937
1938
1939
1940
1941 page = pfn_to_page(pfn_t_to_pfn(pfn));
1942 err = insert_page(vma, addr, page, pgprot);
1943 } else {
1944 err = insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1945 }
1946
1947 if (err == -ENOMEM)
1948 return VM_FAULT_OOM;
1949 if (err < 0 && err != -EBUSY)
1950 return VM_FAULT_SIGBUS;
1951
1952 return VM_FAULT_NOPAGE;
1953}
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
1982 pfn_t pfn, pgprot_t pgprot)
1983{
1984 return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
1985}
1986EXPORT_SYMBOL(vmf_insert_mixed_prot);
1987
1988vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1989 pfn_t pfn)
1990{
1991 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
1992}
1993EXPORT_SYMBOL(vmf_insert_mixed);
1994
1995
1996
1997
1998
1999
2000vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2001 unsigned long addr, pfn_t pfn)
2002{
2003 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2004}
2005EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2006
2007
2008
2009
2010
2011
2012static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2013 unsigned long addr, unsigned long end,
2014 unsigned long pfn, pgprot_t prot)
2015{
2016 pte_t *pte, *mapped_pte;
2017 spinlock_t *ptl;
2018 int err = 0;
2019
2020 mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2021 if (!pte)
2022 return -ENOMEM;
2023 arch_enter_lazy_mmu_mode();
2024 do {
2025 BUG_ON(!pte_none(*pte));
2026 if (!pfn_modify_allowed(pfn, prot)) {
2027 err = -EACCES;
2028 break;
2029 }
2030 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2031 pfn++;
2032 } while (pte++, addr += PAGE_SIZE, addr != end);
2033 arch_leave_lazy_mmu_mode();
2034 pte_unmap_unlock(mapped_pte, ptl);
2035 return err;
2036}
2037
2038static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2039 unsigned long addr, unsigned long end,
2040 unsigned long pfn, pgprot_t prot)
2041{
2042 pmd_t *pmd;
2043 unsigned long next;
2044 int err;
2045
2046 pfn -= addr >> PAGE_SHIFT;
2047 pmd = pmd_alloc(mm, pud, addr);
2048 if (!pmd)
2049 return -ENOMEM;
2050 VM_BUG_ON(pmd_trans_huge(*pmd));
2051 do {
2052 next = pmd_addr_end(addr, end);
2053 err = remap_pte_range(mm, pmd, addr, next,
2054 pfn + (addr >> PAGE_SHIFT), prot);
2055 if (err)
2056 return err;
2057 } while (pmd++, addr = next, addr != end);
2058 return 0;
2059}
2060
2061static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2062 unsigned long addr, unsigned long end,
2063 unsigned long pfn, pgprot_t prot)
2064{
2065 pud_t *pud;
2066 unsigned long next;
2067 int err;
2068
2069 pfn -= addr >> PAGE_SHIFT;
2070 pud = pud_alloc(mm, p4d, addr);
2071 if (!pud)
2072 return -ENOMEM;
2073 do {
2074 next = pud_addr_end(addr, end);
2075 err = remap_pmd_range(mm, pud, addr, next,
2076 pfn + (addr >> PAGE_SHIFT), prot);
2077 if (err)
2078 return err;
2079 } while (pud++, addr = next, addr != end);
2080 return 0;
2081}
2082
2083static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2084 unsigned long addr, unsigned long end,
2085 unsigned long pfn, pgprot_t prot)
2086{
2087 p4d_t *p4d;
2088 unsigned long next;
2089 int err;
2090
2091 pfn -= addr >> PAGE_SHIFT;
2092 p4d = p4d_alloc(mm, pgd, addr);
2093 if (!p4d)
2094 return -ENOMEM;
2095 do {
2096 next = p4d_addr_end(addr, end);
2097 err = remap_pud_range(mm, p4d, addr, next,
2098 pfn + (addr >> PAGE_SHIFT), prot);
2099 if (err)
2100 return err;
2101 } while (p4d++, addr = next, addr != end);
2102 return 0;
2103}
2104
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
2117int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2118 unsigned long pfn, unsigned long size, pgprot_t prot)
2119{
2120 pgd_t *pgd;
2121 unsigned long next;
2122 unsigned long end = addr + PAGE_ALIGN(size);
2123 struct mm_struct *mm = vma->vm_mm;
2124 unsigned long remap_pfn = pfn;
2125 int err;
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145 if (is_cow_mapping(vma->vm_flags)) {
2146 if (addr != vma->vm_start || end != vma->vm_end)
2147 return -EINVAL;
2148 vma->vm_pgoff = pfn;
2149 }
2150
2151 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
2152 if (err)
2153 return -EINVAL;
2154
2155 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2156
2157 BUG_ON(addr >= end);
2158 pfn -= addr >> PAGE_SHIFT;
2159 pgd = pgd_offset(mm, addr);
2160 flush_cache_range(vma, addr, end);
2161 do {
2162 next = pgd_addr_end(addr, end);
2163 err = remap_p4d_range(mm, pgd, addr, next,
2164 pfn + (addr >> PAGE_SHIFT), prot);
2165 if (err)
2166 break;
2167 } while (pgd++, addr = next, addr != end);
2168
2169 if (err)
2170 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
2171
2172 return err;
2173}
2174EXPORT_SYMBOL(remap_pfn_range);
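
/*
 * Canonical use, sketched (phys_base is a placeholder driver value):
 * mapping a device's MMIO window from an mmap handler:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       phys_base >> PAGE_SHIFT,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 *
 * For I/O memory, io_remap_pfn_range()/vm_iomap_memory() below are the
 * preferred wrappers.
 */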
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2192{
2193 unsigned long vm_len, pfn, pages;
2194
2195
2196 if (start + len < start)
2197 return -EINVAL;
2198
2199
2200
2201
2202
2203 len += start & ~PAGE_MASK;
2204 pfn = start >> PAGE_SHIFT;
2205 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2206 if (pfn + pages < pfn)
2207 return -EINVAL;
2208
2209
2210 if (vma->vm_pgoff > pages)
2211 return -EINVAL;
2212 pfn += vma->vm_pgoff;
2213 pages -= vma->vm_pgoff;
2214
2215
2216 vm_len = vma->vm_end - vma->vm_start;
2217 if (vm_len >> PAGE_SHIFT > pages)
2218 return -EINVAL;
2219
2220
2221 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2222}
2223EXPORT_SYMBOL(vm_iomap_memory);
2224
2225static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2226 unsigned long addr, unsigned long end,
2227 pte_fn_t fn, void *data, bool create)
2228{
2229 pte_t *pte, *mapped_pte;
2230 int err = 0;
2231 spinlock_t *uninitialized_var(ptl);
2232
2233 if (create) {
2234 mapped_pte = pte = (mm == &init_mm) ?
2235 pte_alloc_kernel(pmd, addr) :
2236 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2237 if (!pte)
2238 return -ENOMEM;
2239 } else {
2240 mapped_pte = pte = (mm == &init_mm) ?
2241 pte_offset_kernel(pmd, addr) :
2242 pte_offset_map_lock(mm, pmd, addr, &ptl);
2243 }
2244
2245 BUG_ON(pmd_huge(*pmd));
2246
2247 arch_enter_lazy_mmu_mode();
2248
2249 if (fn) {
2250 do {
2251 if (create || !pte_none(*pte)) {
2252 err = fn(pte++, addr, data);
2253 if (err)
2254 break;
2255 }
2256 } while (addr += PAGE_SIZE, addr != end);
2257 }
2258
2259 arch_leave_lazy_mmu_mode();
2260
2261 if (mm != &init_mm)
2262 pte_unmap_unlock(mapped_pte, ptl);
2263 return err;
2264}
2265
2266static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2267 unsigned long addr, unsigned long end,
2268 pte_fn_t fn, void *data, bool create)
2269{
2270 pmd_t *pmd;
2271 unsigned long next;
2272 int err = 0;
2273
2274 BUG_ON(pud_huge(*pud));
2275
2276 if (create) {
2277 pmd = pmd_alloc(mm, pud, addr);
2278 if (!pmd)
2279 return -ENOMEM;
2280 } else {
2281 pmd = pmd_offset(pud, addr);
2282 }
2283 do {
2284 next = pmd_addr_end(addr, end);
2285 if (create || !pmd_none_or_clear_bad(pmd)) {
2286 err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
2287 create);
2288 if (err)
2289 break;
2290 }
2291 } while (pmd++, addr = next, addr != end);
2292 return err;
2293}
2294
2295static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2296 unsigned long addr, unsigned long end,
2297 pte_fn_t fn, void *data, bool create)
2298{
2299 pud_t *pud;
2300 unsigned long next;
2301 int err = 0;
2302
2303 if (create) {
2304 pud = pud_alloc(mm, p4d, addr);
2305 if (!pud)
2306 return -ENOMEM;
2307 } else {
2308 pud = pud_offset(p4d, addr);
2309 }
2310 do {
2311 next = pud_addr_end(addr, end);
2312 if (create || !pud_none_or_clear_bad(pud)) {
2313 err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
2314 create);
2315 if (err)
2316 break;
2317 }
2318 } while (pud++, addr = next, addr != end);
2319 return err;
2320}
2321
2322static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2323 unsigned long addr, unsigned long end,
2324 pte_fn_t fn, void *data, bool create)
2325{
2326 p4d_t *p4d;
2327 unsigned long next;
2328 int err = 0;
2329
2330 if (create) {
2331 p4d = p4d_alloc(mm, pgd, addr);
2332 if (!p4d)
2333 return -ENOMEM;
2334 } else {
2335 p4d = p4d_offset(pgd, addr);
2336 }
2337 do {
2338 next = p4d_addr_end(addr, end);
2339 if (create || !p4d_none_or_clear_bad(p4d)) {
2340 err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
2341 create);
2342 if (err)
2343 break;
2344 }
2345 } while (p4d++, addr = next, addr != end);
2346 return err;
2347}
2348
2349static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2350 unsigned long size, pte_fn_t fn,
2351 void *data, bool create)
2352{
2353 pgd_t *pgd;
2354 unsigned long next;
2355 unsigned long end = addr + size;
2356 int err = 0;
2357
2358 if (WARN_ON(addr >= end))
2359 return -EINVAL;
2360
2361 pgd = pgd_offset(mm, addr);
2362 do {
2363 next = pgd_addr_end(addr, end);
2364 if (!create && pgd_none_or_clear_bad(pgd))
2365 continue;
2366 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
2367 if (err)
2368 break;
2369 } while (pgd++, addr = next, addr != end);
2370
2371 return err;
2372}
2373
2374
2375
2376
2377
2378int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2379 unsigned long size, pte_fn_t fn, void *data)
2380{
2381 return __apply_to_page_range(mm, addr, size, fn, data, true);
2382}
2383EXPORT_SYMBOL_GPL(apply_to_page_range);
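
/*
 * The callback has the pte_fn_t shape used by the loop in
 * apply_to_pte_range() above: it is handed each pte slot plus the
 * caller's opaque data pointer.  A do-nothing example (hypothetical
 * name, shown only to illustrate the contract):
 *
 *	static int touch_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		// inspect or rewrite *pte for this addr; non-zero aborts
 *		return 0;
 *	}
 *
 *	apply_to_page_range(mm, start, size, touch_pte, NULL);
 */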
2384
2385
2386
2387
2388
2389
2390
2391
2392int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2393 unsigned long size, pte_fn_t fn, void *data)
2394{
2395 return __apply_to_page_range(mm, addr, size, fn, data, false);
2396}
2397EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2408 pte_t *page_table, pte_t orig_pte)
2409{
2410 int same = 1;
2411#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2412 if (sizeof(pte_t) > sizeof(unsigned long)) {
2413 spinlock_t *ptl = pte_lockptr(mm, pmd);
2414 spin_lock(ptl);
2415 same = pte_same(*page_table, orig_pte);
2416 spin_unlock(ptl);
2417 }
2418#endif
2419 pte_unmap(page_table);
2420 return same;
2421}
2422
2423static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2424{
2425
2426
2427
2428
2429
2430
2431 if (unlikely(!src)) {
2432 void *kaddr = kmap_atomic(dst);
2433 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2434
2435
2436
2437
2438
2439
2440
2441 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2442 clear_page(kaddr);
2443 kunmap_atomic(kaddr);
2444 flush_dcache_page(dst);
2445 } else
2446 copy_user_highpage(dst, src, va, vma);
2447}
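
/*
 * The "!src" branch above covers COW faults on special mappings (e.g.
 * a VM_PFNMAP page with no struct page): there is nothing we can kmap
 * as a source, so the data is pulled in through the user virtual
 * address instead, and a faulting access simply yields a zeroed page.
 */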
2448
2449static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2450{
2451 struct file *vm_file = vma->vm_file;
2452
2453 if (vm_file)
2454 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2455
2456
2457
2458
2459
2460 return GFP_KERNEL;
2461}
2462
2463
2464
2465
2466
2467
2468
2469static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2470{
2471 vm_fault_t ret;
2472 struct page *page = vmf->page;
2473 unsigned int old_flags = vmf->flags;
2474
2475 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2476
2477 if (vmf->vma->vm_file &&
2478 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2479 return VM_FAULT_SIGBUS;
2480
2481 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2482
2483 vmf->flags = old_flags;
2484 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2485 return ret;
2486 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2487 lock_page(page);
2488 if (!page->mapping) {
2489 unlock_page(page);
2490 return 0;
2491 }
2492 ret |= VM_FAULT_LOCKED;
2493 } else
2494 VM_BUG_ON_PAGE(!PageLocked(page), page);
2495 return ret;
2496}
2497
2498
2499
2500
2501
2502
2503static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2504{
2505 struct vm_area_struct *vma = vmf->vma;
2506 struct address_space *mapping;
2507 struct page *page = vmf->page;
2508 bool dirtied;
2509 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2510
2511 dirtied = set_page_dirty(page);
2512 VM_BUG_ON_PAGE(PageAnon(page), page);
2513
2514
2515
2516
2517
2518
2519 mapping = page_rmapping(page);
2520 unlock_page(page);
2521
2522 if (!page_mkwrite)
2523 file_update_time(vma->vm_file);
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534 if ((dirtied || page_mkwrite) && mapping) {
2535 struct file *fpin;
2536
2537 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2538 balance_dirty_pages_ratelimited(mapping);
2539 if (fpin) {
2540 fput(fpin);
2541 return VM_FAULT_RETRY;
2542 }
2543 }
2544
2545 return 0;
2546}
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556static inline void wp_page_reuse(struct vm_fault *vmf)
2557 __releases(vmf->ptl)
2558{
2559 struct vm_area_struct *vma = vmf->vma;
2560 struct page *page = vmf->page;
2561 pte_t entry;
2562
2563
2564
2565
2566
2567 if (page)
2568 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2569
2570 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2571 entry = pte_mkyoung(vmf->orig_pte);
2572 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2573 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2574 update_mmu_cache(vma, vmf->address, vmf->pte);
2575 pte_unmap_unlock(vmf->pte, vmf->ptl);
2576}
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2595{
2596 struct vm_area_struct *vma = vmf->vma;
2597 struct mm_struct *mm = vma->vm_mm;
2598 struct page *old_page = vmf->page;
2599 struct page *new_page = NULL;
2600 pte_t entry;
2601 int page_copied = 0;
2602 struct mmu_notifier_range range;
2603
2604 if (unlikely(anon_vma_prepare(vma)))
2605 goto oom;
2606
2607 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2608 new_page = alloc_zeroed_user_highpage_movable(vma,
2609 vmf->address);
2610 if (!new_page)
2611 goto oom;
2612 } else {
2613 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2614 vmf->address);
2615 if (!new_page)
2616 goto oom;
2617 cow_user_page(new_page, old_page, vmf->address, vma);
2618 }
2619
2620 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
2621 goto oom_free_new;
2622 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
2623
2624 __SetPageUptodate(new_page);
2625
2626 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
2627 vmf->address & PAGE_MASK,
2628 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2629 mmu_notifier_invalidate_range_start(&range);
2630
2631
2632
2633
2634 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2635 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2636 if (old_page) {
2637 if (!PageAnon(old_page)) {
2638 dec_mm_counter_fast(mm,
2639 mm_counter_file(old_page));
2640 inc_mm_counter_fast(mm, MM_ANONPAGES);
2641 }
2642 } else {
2643 inc_mm_counter_fast(mm, MM_ANONPAGES);
2644 }
2645 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2646 entry = mk_pte(new_page, vma->vm_page_prot);
2647 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2648
2649
2650
2651
2652
2653
2654 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2655 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2656 lru_cache_add_inactive_or_unevictable(new_page, vma);
2657
2658
2659
2660
2661
2662 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2663 update_mmu_cache(vma, vmf->address, vmf->pte);
2664 if (old_page) {
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687 page_remove_rmap(old_page, false);
2688 }
2689
2690
2691 new_page = old_page;
2692 page_copied = 1;
2693 }
2694
2695 if (new_page)
2696 put_page(new_page);
2697
2698 pte_unmap_unlock(vmf->pte, vmf->ptl);
2699
2700
2701
2702
2703 mmu_notifier_invalidate_range_only_end(&range);
2704 if (old_page) {
2705
2706
2707
2708
2709 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2710 lock_page(old_page);
2711 if (PageMlocked(old_page))
2712 munlock_vma_page(old_page);
2713 unlock_page(old_page);
2714 }
2715 put_page(old_page);
2716 }
2717 return page_copied ? VM_FAULT_WRITE : 0;
2718oom_free_new:
2719 put_page(new_page);
2720oom:
2721 if (old_page)
2722 put_page(old_page);
2723 return VM_FAULT_OOM;
2724}
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2743{
2744 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2745 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2746 &vmf->ptl);
2747
2748
2749
2750
2751 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2752 pte_unmap_unlock(vmf->pte, vmf->ptl);
2753 return VM_FAULT_NOPAGE;
2754 }
2755 wp_page_reuse(vmf);
2756 return 0;
2757}
2758
/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping: there is no struct page to copy, so just make the pfn writable,
 * calling ->pfn_mkwrite() first if the driver provides it.
 */
2763static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
2764{
2765 struct vm_area_struct *vma = vmf->vma;
2766
2767 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2768 vm_fault_t ret;
2769
2770 pte_unmap_unlock(vmf->pte, vmf->ptl);
2771 vmf->flags |= FAULT_FLAG_MKWRITE;
2772 ret = vma->vm_ops->pfn_mkwrite(vmf);
2773 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2774 return ret;
2775 return finish_mkwrite_fault(vmf);
2776 }
2777 wp_page_reuse(vmf);
2778 return VM_FAULT_WRITE;
2779}
2780
2781static vm_fault_t wp_page_shared(struct vm_fault *vmf)
2782 __releases(vmf->ptl)
2783{
2784 struct vm_area_struct *vma = vmf->vma;
2785 vm_fault_t ret = VM_FAULT_WRITE;
2786
2787 get_page(vmf->page);
2788
2789 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2790 vm_fault_t tmp;
2791
2792 pte_unmap_unlock(vmf->pte, vmf->ptl);
2793 tmp = do_page_mkwrite(vmf);
2794 if (unlikely(!tmp || (tmp &
2795 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2796 put_page(vmf->page);
2797 return tmp;
2798 }
2799 tmp = finish_mkwrite_fault(vmf);
2800 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2801 unlock_page(vmf->page);
2802 put_page(vmf->page);
2803 return tmp;
2804 }
2805 } else {
2806 wp_page_reuse(vmf);
2807 lock_page(vmf->page);
2808 }
2809 ret |= fault_dirty_shared_page(vmf);
2810 put_page(vmf->page);
2811
2812 return ret;
2813}
2814
/*
 * This routine handles present pages, when users try to write
 * to a shared page.  It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens.  This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
2833static vm_fault_t do_wp_page(struct vm_fault *vmf)
2834 __releases(vmf->ptl)
2835{
2836 struct vm_area_struct *vma = vmf->vma;
2837
2838 if (userfaultfd_pte_wp(vma, *vmf->pte)) {
2839 pte_unmap_unlock(vmf->pte, vmf->ptl);
2840 return handle_userfault(vmf, VM_UFFD_WP);
2841 }
2842
	/*
	 * Userfaultfd write-protect can defer flushes.  Ensure the TLB
	 * is flushed in this case before copying.
	 */
2847 if (unlikely(userfaultfd_wp(vmf->vma) &&
2848 mm_tlb_flush_pending(vmf->vma->vm_mm)))
2849 flush_tlb_page(vmf->vma, vmf->address);
2850
2851 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2852 if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
2860 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2861 (VM_WRITE|VM_SHARED))
2862 return wp_pfn_shared(vmf);
2863
2864 pte_unmap_unlock(vmf->pte, vmf->ptl);
2865 return wp_page_copy(vmf);
2866 }
2867
2868
2869
2870
2871
2872 if (PageAnon(vmf->page)) {
2873 int total_map_swapcount;
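		/*
		 * A KSM page that is also in the swap cache, or that has
		 * extra references, cannot be reused in place: go straight
		 * to the copy path.
		 */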
2874 if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
2875 page_count(vmf->page) != 1))
2876 goto copy;
2877 if (!trylock_page(vmf->page)) {
2878 get_page(vmf->page);
2879 pte_unmap_unlock(vmf->pte, vmf->ptl);
2880 lock_page(vmf->page);
2881 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2882 vmf->address, &vmf->ptl);
2883 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2884 unlock_page(vmf->page);
2885 pte_unmap_unlock(vmf->pte, vmf->ptl);
2886 put_page(vmf->page);
2887 return 0;
2888 }
2889 put_page(vmf->page);
2890 }
2891 if (PageKsm(vmf->page)) {
2892 bool reused = reuse_ksm_page(vmf->page, vmf->vma,
2893 vmf->address);
2894 unlock_page(vmf->page);
2895 if (!reused)
2896 goto copy;
2897 wp_page_reuse(vmf);
2898 return VM_FAULT_WRITE;
2899 }
2900 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2901 if (total_map_swapcount == 1) {
				/*
				 * The page is all ours.  Move it to our
				 * anon_vma so the rmap code will not search
				 * our parent or siblings.  Protected against
				 * the rmap code by the page lock.
				 */
2909 page_move_anon_rmap(vmf->page, vma);
2910 }
2911 unlock_page(vmf->page);
2912 wp_page_reuse(vmf);
2913 return VM_FAULT_WRITE;
2914 }
2915 unlock_page(vmf->page);
2916 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2917 (VM_WRITE|VM_SHARED))) {
2918 return wp_page_shared(vmf);
2919 }
2920copy:
2921
2922
2923
2924 get_page(vmf->page);
2925
2926 pte_unmap_unlock(vmf->pte, vmf->ptl);
2927 return wp_page_copy(vmf);
2928}
2929
2930static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2931 unsigned long start_addr, unsigned long end_addr,
2932 struct zap_details *details)
2933{
2934 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2935}
2936
2937static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2938 struct zap_details *details)
2939{
2940 struct vm_area_struct *vma;
2941 pgoff_t vba, vea, zba, zea;
2942
2943 vma_interval_tree_foreach(vma, root,
2944 details->first_index, details->last_index) {
2945
2946 vba = vma->vm_pgoff;
2947 vea = vba + vma_pages(vma) - 1;
2948 zba = details->first_index;
2949 if (zba < vba)
2950 zba = vba;
2951 zea = details->last_index;
2952 if (zea > vea)
2953 zea = vea;
2954
2955 unmap_mapping_range_vma(vma,
2956 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2957 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2958 details);
2959 }
2960}
2961
/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
2974void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
2975 pgoff_t nr, bool even_cows)
2976{
2977 struct zap_details details = { };
2978
2979 details.check_mapping = even_cows ? NULL : mapping;
2980 details.first_index = start;
2981 details.last_index = start + nr - 1;
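	/*
	 * A zero @nr (or an overflowing @start + @nr) wraps the end index
	 * below @start; treat that as "unmap to the end of the file".
	 */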
2982 if (details.last_index < details.first_index)
2983 details.last_index = ULONG_MAX;
2984
2985 i_mmap_lock_write(mapping);
2986 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2987 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2988 i_mmap_unlock_write(mapping);
2989}
2990
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
3008void unmap_mapping_range(struct address_space *mapping,
3009 loff_t const holebegin, loff_t const holelen, int even_cows)
3010{
3011 pgoff_t hba = holebegin >> PAGE_SHIFT;
3012 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3013
3014
3015 if (sizeof(holelen) > sizeof(hlen)) {
3016 long long holeend =
3017 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3018 if (holeend & ~(long long)ULONG_MAX)
3019 hlen = ULONG_MAX - hba + 1;
3020 }
3021
3022 unmap_mapping_pages(mapping, hba, hlen, even_cows);
3023}
3024EXPORT_SYMBOL(unmap_mapping_range);
3025
/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
3034vm_fault_t do_swap_page(struct vm_fault *vmf)
3035{
3036 struct vm_area_struct *vma = vmf->vma;
3037 struct page *page = NULL, *swapcache;
3038 swp_entry_t entry;
3039 pte_t pte;
3040 int locked;
3041 int exclusive = 0;
3042 vm_fault_t ret = 0;
3043 void *shadow = NULL;
3044
3045 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
3046 goto out;
3047
3048 entry = pte_to_swp_entry(vmf->orig_pte);
3049 if (unlikely(non_swap_entry(entry))) {
3050 if (is_migration_entry(entry)) {
3051 migration_entry_wait(vma->vm_mm, vmf->pmd,
3052 vmf->address);
3053 } else if (is_device_private_entry(entry)) {
3054 vmf->page = device_private_entry_to_page(entry);
3055 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3056 } else if (is_hwpoison_entry(entry)) {
3057 ret = VM_FAULT_HWPOISON;
3058 } else {
3059 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3060 ret = VM_FAULT_SIGBUS;
3061 }
3062 goto out;
3063 }
3064
3065
3066 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3067 page = lookup_swap_cache(entry, vma, vmf->address);
3068 swapcache = page;
3069
3070 if (!page) {
3071 struct swap_info_struct *si = swp_swap_info(entry);
3072
3073 if (si->flags & SWP_SYNCHRONOUS_IO &&
3074 __swap_count(entry) == 1) {
3075
3076 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3077 vmf->address);
3078 if (page) {
3079 __SetPageLocked(page);
3080 __SetPageSwapBacked(page);
3081
3082 if (mem_cgroup_swapin_charge_page(page,
3083 vma->vm_mm, GFP_KERNEL, entry)) {
3084 ret = VM_FAULT_OOM;
3085 goto out_page;
3086 }
3087 mem_cgroup_swapin_uncharge_swap(entry);
3088
3089 shadow = get_shadow_from_swap_cache(entry);
3090 if (shadow)
3091 workingset_refault(page, shadow);
3092
3093 lru_cache_add(page);
3094
3095
3096 set_page_private(page, entry.val);
3097 swap_readpage(page, true);
3098 set_page_private(page, 0);
3099 }
3100 } else {
3101 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3102 vmf);
3103 swapcache = page;
3104 }
3105
3106 if (!page) {
		/*
		 * Back out quietly if somebody else faulted in this pte
		 * while we released the pte lock.
		 */
3111 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3112 vmf->address, &vmf->ptl);
3113 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3114 ret = VM_FAULT_OOM;
3115 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3116 goto unlock;
3117 }
3118
3119
3120 ret = VM_FAULT_MAJOR;
3121 count_vm_event(PGMAJFAULT);
3122 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3123 } else if (PageHWPoison(page)) {
3124
3125
3126
3127
3128 ret = VM_FAULT_HWPOISON;
3129 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3130 goto out_release;
3131 }
3132
3133 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3134
3135 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3136 if (!locked) {
3137 ret |= VM_FAULT_RETRY;
3138 goto out_release;
3139 }
3140
	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
3147 if (unlikely((!PageSwapCache(page) ||
3148 page_private(page) != entry.val)) && swapcache)
3149 goto out_page;
3150
3151 page = ksm_might_need_to_copy(page, vma, vmf->address);
3152 if (unlikely(!page)) {
3153 ret = VM_FAULT_OOM;
3154 page = swapcache;
3155 goto out_page;
3156 }
3157
3158 cgroup_throttle_swaprate(page, GFP_KERNEL);
3159
3160
3161
3162
3163 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3164 &vmf->ptl);
3165 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3166 goto out_nomap;
3167
3168 if (unlikely(!PageUptodate(page))) {
3169 ret = VM_FAULT_SIGBUS;
3170 goto out_nomap;
3171 }
3172
	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is locked, and the rmap and LRU additions plus
	 * swap_free() below rely on the pte being installed under the
	 * page table lock we are still holding.
	 */
3183 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3184 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3185 pte = mk_pte(page, vma->vm_page_prot);
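	/*
	 * For a write fault, try to reuse the swapped-in page directly: if
	 * reuse_swap_page() confirms we are the only user, map it writable
	 * now and clear FAULT_FLAG_WRITE so we don't take a second trip
	 * through do_wp_page() below.
	 */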
3186 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3187 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3188 vmf->flags &= ~FAULT_FLAG_WRITE;
3189 ret |= VM_FAULT_WRITE;
3190 exclusive = RMAP_EXCLUSIVE;
3191 }
3192 flush_icache_page(vma, page);
3193 if (pte_swp_soft_dirty(vmf->orig_pte))
3194 pte = pte_mksoft_dirty(pte);
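	/*
	 * Carry the uffd write-protect bit over from the swap entry and
	 * keep the pte write-protected, so userfaultfd is still notified
	 * of the first write to this page.
	 */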
3195 if (pte_swp_uffd_wp(vmf->orig_pte)) {
3196 pte = pte_mkuffd_wp(pte);
3197 pte = pte_wrprotect(pte);
3198 }
3199 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3200 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3201 vmf->orig_pte = pte;
3202
3203
3204 if (unlikely(page != swapcache && swapcache)) {
3205 page_add_new_anon_rmap(page, vma, vmf->address, false);
3206 lru_cache_add_inactive_or_unevictable(page, vma);
3207 } else {
3208 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3209 }
3210
3211 swap_free(entry);
3212 if (mem_cgroup_swap_full(page) ||
3213 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3214 try_to_free_swap(page);
3215 unlock_page(page);
3216 if (page != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same).  For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
3225 unlock_page(swapcache);
3226 put_page(swapcache);
3227 }
3228
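	/*
	 * If this was a write fault and we could not reuse the page above,
	 * let do_wp_page() sort out COW vs. reuse now that the pte is in
	 * place; it drops the pte lock for us.
	 */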
3229 if (vmf->flags & FAULT_FLAG_WRITE) {
3230 ret |= do_wp_page(vmf);
3231 if (ret & VM_FAULT_ERROR)
3232 ret &= VM_FAULT_ERROR;
3233 goto out;
3234 }
3235
3236
3237 update_mmu_cache(vma, vmf->address, vmf->pte);
3238unlock:
3239 pte_unmap_unlock(vmf->pte, vmf->ptl);
3240out:
3241 return ret;
3242out_nomap:
3243 pte_unmap_unlock(vmf->pte, vmf->ptl);
3244out_page:
3245 unlock_page(page);
3246out_release:
3247 put_page(page);
3248 if (page != swapcache && swapcache) {
3249 unlock_page(swapcache);
3250 put_page(swapcache);
3251 }
3252 return ret;
3253}
3254
/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
3260static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3261{
3262 struct vm_area_struct *vma = vmf->vma;
3263 struct page *page;
3264 vm_fault_t ret = 0;
3265 pte_t entry;
3266
3267
3268 if (vma->vm_flags & VM_SHARED)
3269 return VM_FAULT_SIGBUS;
3270
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
3281 if (pte_alloc(vma->vm_mm, vmf->pmd))
3282 return VM_FAULT_OOM;
3283
3284
3285 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3286 return 0;
3287
3288
3289 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3290 !mm_forbids_zeropage(vma->vm_mm)) {
3291 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3292 vma->vm_page_prot));
3293 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3294 vmf->address, &vmf->ptl);
3295 if (!pte_none(*vmf->pte))
3296 goto unlock;
3297 ret = check_stable_address_space(vma->vm_mm);
3298 if (ret)
3299 goto unlock;
3300
3301 if (userfaultfd_missing(vma)) {
3302 pte_unmap_unlock(vmf->pte, vmf->ptl);
3303 return handle_userfault(vmf, VM_UFFD_MISSING);
3304 }
3305 goto setpte;
3306 }
3307
3308
3309 if (unlikely(anon_vma_prepare(vma)))
3310 goto oom;
3311 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3312 if (!page)
3313 goto oom;
3314
3315 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3316 goto oom_free_page;
3317 cgroup_throttle_swaprate(page, GFP_KERNEL);
3318
3319
3320
3321
3322
3323
3324 __SetPageUptodate(page);
3325
3326 entry = mk_pte(page, vma->vm_page_prot);
3327 if (vma->vm_flags & VM_WRITE)
3328 entry = pte_mkwrite(pte_mkdirty(entry));
3329
3330 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3331 &vmf->ptl);
3332 if (!pte_none(*vmf->pte))
3333 goto release;
3334
3335 ret = check_stable_address_space(vma->vm_mm);
3336 if (ret)
3337 goto release;
3338
3339
3340 if (userfaultfd_missing(vma)) {
3341 pte_unmap_unlock(vmf->pte, vmf->ptl);
3342 put_page(page);
3343 return handle_userfault(vmf, VM_UFFD_MISSING);
3344 }
3345
3346 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3347 page_add_new_anon_rmap(page, vma, vmf->address, false);
3348 lru_cache_add_inactive_or_unevictable(page, vma);
3349setpte:
3350 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3351
3352
3353 update_mmu_cache(vma, vmf->address, vmf->pte);
3354unlock:
3355 pte_unmap_unlock(vmf->pte, vmf->ptl);
3356 return ret;
3357release:
3358 put_page(page);
3359 goto unlock;
3360oom_free_page:
3361 put_page(page);
3362oom:
3363 return VM_FAULT_OOM;
3364}
3365
/*
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3371static vm_fault_t __do_fault(struct vm_fault *vmf)
3372{
3373 struct vm_area_struct *vma = vmf->vma;
3374 vm_fault_t ret;
3375
	/*
	 * Preallocate the page table before taking the page lock: a
	 * pte_alloc_one() done later, with the faulted page locked, could
	 * recurse into memcg reclaim and end up waiting for writeback on
	 * pages that cannot complete until this page is unlocked,
	 * deadlocking the fault.  The smp_wmb() below pairs with the reads
	 * done when the preallocated table is installed (see the comment
	 * in __pte_alloc()).
	 */
3391 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3392 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3393 if (!vmf->prealloc_pte)
3394 return VM_FAULT_OOM;
3395 smp_wmb();
3396 }
3397
3398 ret = vma->vm_ops->fault(vmf);
3399 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3400 VM_FAULT_DONE_COW)))
3401 return ret;
3402
3403 if (unlikely(PageHWPoison(vmf->page))) {
3404 if (ret & VM_FAULT_LOCKED)
3405 unlock_page(vmf->page);
3406 put_page(vmf->page);
3407 vmf->page = NULL;
3408 return VM_FAULT_HWPOISON;
3409 }
3410
3411 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3412 lock_page(vmf->page);
3413 else
3414 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3415
3416 return ret;
3417}
3418
/*
 * Return true if this pmd must not be walked with pte_offset_map():
 * either it maps a devmap huge page or it is "unstable", i.e. it may
 * be a transparent huge pmd that is being created or split under us.
 * Callers should bail out (and let the fault be retried) in that case.
 */
3425static int pmd_devmap_trans_unstable(pmd_t *pmd)
3426{
3427 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3428}
3429
3430static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3431{
3432 struct vm_area_struct *vma = vmf->vma;
3433
3434 if (!pmd_none(*vmf->pmd))
3435 goto map_pte;
3436 if (vmf->prealloc_pte) {
3437 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3438 if (unlikely(!pmd_none(*vmf->pmd))) {
3439 spin_unlock(vmf->ptl);
3440 goto map_pte;
3441 }
3442
3443 mm_inc_nr_ptes(vma->vm_mm);
3444 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3445 spin_unlock(vmf->ptl);
3446 vmf->prealloc_pte = NULL;
3447 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3448 return VM_FAULT_OOM;
3449 }
3450map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_devmap_trans_unstable() (which wraps pmd_trans_unstable())
	 * rather than pmd_trans_huge(): the pmd may have become huge and
	 * then pmd_none again via MADV_DONTNEED running in another thread
	 * of this mm, which would make a bare pmd_trans_huge() check
	 * misleading.  All we need to know is that the pmd is a regular
	 * one that we can walk with pte_offset_map(), and the atomic read
	 * in pmd_trans_unstable() tells us exactly that.
	 */
3462 if (pmd_devmap_trans_unstable(vmf->pmd))
3463 return VM_FAULT_NOPAGE;
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3475 &vmf->ptl);
3476 return 0;
3477}
3478
3479#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3480static void deposit_prealloc_pte(struct vm_fault *vmf)
3481{
3482 struct vm_area_struct *vma = vmf->vma;
3483
3484 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3485
3486
3487
3488
3489 mm_inc_nr_ptes(vma->vm_mm);
3490 vmf->prealloc_pte = NULL;
3491}
3492
3493static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3494{
3495 struct vm_area_struct *vma = vmf->vma;
3496 bool write = vmf->flags & FAULT_FLAG_WRITE;
3497 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3498 pmd_t entry;
3499 int i;
3500 vm_fault_t ret = VM_FAULT_FALLBACK;
3501
3502 if (!transhuge_vma_suitable(vma, haddr))
3503 return ret;
3504
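	/*
	 * Only a PMD-sized compound page can be mapped with a huge pmd;
	 * anything else falls back to the pte path.
	 */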
3505 page = compound_head(page);
3506 if (compound_order(page) != HPAGE_PMD_ORDER)
3507 return ret;
3508
3509
3510
3511
3512
3513 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3514 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3515 if (!vmf->prealloc_pte)
3516 return VM_FAULT_OOM;
3517 smp_wmb();
3518 }
3519
3520 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3521 if (unlikely(!pmd_none(*vmf->pmd)))
3522 goto out;
3523
3524 for (i = 0; i < HPAGE_PMD_NR; i++)
3525 flush_icache_page(vma, page + i);
3526
3527 entry = mk_huge_pmd(page, vma->vm_page_prot);
3528 if (write)
3529 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3530
3531 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3532 page_add_file_rmap(page, true);
3533
3534
3535
3536 if (arch_needs_pgtable_deposit())
3537 deposit_prealloc_pte(vmf);
3538
3539 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3540
3541 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3542
3543
3544 ret = 0;
3545 count_vm_event(THP_FILE_MAPPED);
3546out:
3547 spin_unlock(vmf->ptl);
3548 return ret;
3549}
3550#else
3551static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3552{
3553 BUILD_BUG();
3554 return 0;
3555}
3556#endif
3557
/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping.  If needed, the function allocates a page table or uses the
 * pre-allocated one.
 *
 * @vmf: fault environment
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are the page fault handler itself and implementations of
 * vm_ops->map_pages.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3573vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
3574{
3575 struct vm_area_struct *vma = vmf->vma;
3576 bool write = vmf->flags & FAULT_FLAG_WRITE;
3577 pte_t entry;
3578 vm_fault_t ret;
3579
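	/*
	 * If the page is part of a compound (THP) page and no page table
	 * has been installed yet, try to map it with a huge pmd first;
	 * otherwise fall back to installing a regular pte below.
	 */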
3580 if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
3581 ret = do_set_pmd(vmf, page);
3582 if (ret != VM_FAULT_FALLBACK)
3583 return ret;
3584 }
3585
3586 if (!vmf->pte) {
3587 ret = pte_alloc_one_map(vmf);
3588 if (ret)
3589 return ret;
3590 }
3591
3592
3593 if (unlikely(!pte_none(*vmf->pte)))
3594 return VM_FAULT_NOPAGE;
3595
3596 flush_icache_page(vma, page);
3597 entry = mk_pte(page, vma->vm_page_prot);
3598 if (write)
3599 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3600
3601 if (write && !(vma->vm_flags & VM_SHARED)) {
3602 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3603 page_add_new_anon_rmap(page, vma, vmf->address, false);
3604 lru_cache_add_inactive_or_unevictable(page, vma);
3605 } else {
3606 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3607 page_add_file_rmap(page, false);
3608 }
3609 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3610
3611
3612 update_mmu_cache(vma, vmf->address, vmf->pte);
3613
3614 return 0;
3615}
3616
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared.  It handles locking of PTEs, adds the mapping
 * to the PTE and counters and takes care of memory cgroup changes.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3633vm_fault_t finish_fault(struct vm_fault *vmf)
3634{
3635 struct page *page;
3636 vm_fault_t ret = 0;
3637
3638
3639 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3640 !(vmf->vma->vm_flags & VM_SHARED))
3641 page = vmf->cow_page;
3642 else
3643 page = vmf->page;
3644
3645
3646
3647
3648
3649 if (!(vmf->vma->vm_flags & VM_SHARED))
3650 ret = check_stable_address_space(vmf->vma->vm_mm);
3651 if (!ret)
3652 ret = alloc_set_pte(vmf, page);
3653 if (vmf->pte)
3654 pte_unmap_unlock(vmf->pte, vmf->ptl);
3655 return ret;
3656}
3657
3658static unsigned long fault_around_bytes __read_mostly =
3659 rounddown_pow_of_two(65536);
3660
3661#ifdef CONFIG_DEBUG_FS
3662static int fault_around_bytes_get(void *data, u64 *val)
3663{
3664 *val = fault_around_bytes;
3665 return 0;
3666}
3667
3668
3669
3670
3671
3672static int fault_around_bytes_set(void *data, u64 val)
3673{
3674 if (val / PAGE_SIZE > PTRS_PER_PTE)
3675 return -EINVAL;
3676 if (val > PAGE_SIZE)
3677 fault_around_bytes = rounddown_pow_of_two(val);
3678 else
3679 fault_around_bytes = PAGE_SIZE;
3680 return 0;
3681}
3682DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3683 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3684
3685static int __init fault_around_debugfs(void)
3686{
3687 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3688 &fault_around_bytes_fops);
3689 return 0;
3690}
3691late_initcall(fault_around_debugfs);
3692#endif
3693
/*
 * do_fault_around() tries to map a few pages around the fault address.
 * The hope is that those pages will be needed soon and this will lower
 * the number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips any page
 * that is not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA boundaries, in order to call
 * map_pages() only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map;
 * do_fault_around() expects it to be a power of two less than or equal
 * to PTRS_PER_PTE * PAGE_SIZE (fault_around_bytes_set() enforces this).
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size (and
 * therefore to page order), which makes it easy to guarantee that we
 * don't cross page table boundaries.
 */
3718static vm_fault_t do_fault_around(struct vm_fault *vmf)
3719{
3720 unsigned long address = vmf->address, nr_pages, mask;
3721 pgoff_t start_pgoff = vmf->pgoff;
3722 pgoff_t end_pgoff;
3723 int off;
3724 vm_fault_t ret = 0;
3725
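	/*
	 * Read fault_around_bytes once: it can be updated concurrently via
	 * debugfs, and the window size and mask must be computed from the
	 * same value.
	 */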
3726 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3727 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3728
3729 vmf->address = max(address & mask, vmf->vma->vm_start);
3730 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3731 start_pgoff -= off;
3732
3733
3734
3735
3736
3737 end_pgoff = start_pgoff -
3738 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3739 PTRS_PER_PTE - 1;
3740 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3741 start_pgoff + nr_pages - 1);
3742
3743 if (pmd_none(*vmf->pmd)) {
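		/*
		 * Preallocate a page table (stashed in vmf->prealloc_pte) so
		 * that ->map_pages() has one to install if the pmd is still
		 * empty when the pages are actually mapped.
		 */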
3744 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3745 if (!vmf->prealloc_pte)
3746 goto out;
3747 smp_wmb();
3748 }
3749
3750 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3751
3752
3753 if (pmd_trans_huge(*vmf->pmd)) {
3754 ret = VM_FAULT_NOPAGE;
3755 goto out;
3756 }
3757
3758
3759 if (!vmf->pte)
3760 goto out;
3761
3762
3763 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3764 if (!pte_none(*vmf->pte))
3765 ret = VM_FAULT_NOPAGE;
3766 pte_unmap_unlock(vmf->pte, vmf->ptl);
3767out:
3768 vmf->address = address;
3769 vmf->pte = NULL;
3770 return ret;
3771}
3772
3773static vm_fault_t do_read_fault(struct vm_fault *vmf)
3774{
3775 struct vm_area_struct *vma = vmf->vma;
3776 vm_fault_t ret = 0;
3777
3778
3779
3780
3781
3782
3783 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3784 ret = do_fault_around(vmf);
3785 if (ret)
3786 return ret;
3787 }
3788
3789 ret = __do_fault(vmf);
3790 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3791 return ret;
3792
3793 ret |= finish_fault(vmf);
3794 unlock_page(vmf->page);
3795 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3796 put_page(vmf->page);
3797 return ret;
3798}
3799
3800static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3801{
3802 struct vm_area_struct *vma = vmf->vma;
3803 vm_fault_t ret;
3804
3805 if (unlikely(anon_vma_prepare(vma)))
3806 return VM_FAULT_OOM;
3807
3808 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3809 if (!vmf->cow_page)
3810 return VM_FAULT_OOM;
3811
3812 if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
3813 put_page(vmf->cow_page);
3814 return VM_FAULT_OOM;
3815 }
3816 cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
3817
3818 ret = __do_fault(vmf);
3819 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3820 goto uncharge_out;
3821 if (ret & VM_FAULT_DONE_COW)
3822 return ret;
3823
3824 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3825 __SetPageUptodate(vmf->cow_page);
3826
3827 ret |= finish_fault(vmf);
3828 unlock_page(vmf->page);
3829 put_page(vmf->page);
3830 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3831 goto uncharge_out;
3832 return ret;
3833uncharge_out:
3834 put_page(vmf->cow_page);
3835 return ret;
3836}
3837
3838static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3839{
3840 struct vm_area_struct *vma = vmf->vma;
3841 vm_fault_t ret, tmp;
3842
3843 ret = __do_fault(vmf);
3844 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3845 return ret;
3846
3847
3848
3849
3850
3851 if (vma->vm_ops->page_mkwrite) {
3852 unlock_page(vmf->page);
3853 tmp = do_page_mkwrite(vmf);
3854 if (unlikely(!tmp ||
3855 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3856 put_page(vmf->page);
3857 return tmp;
3858 }
3859 }
3860
3861 ret |= finish_fault(vmf);
3862 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3863 VM_FAULT_RETRY))) {
3864 unlock_page(vmf->page);
3865 put_page(vmf->page);
3866 return ret;
3867 }
3868
3869 ret |= fault_dirty_shared_page(vmf);
3870 return ret;
3871}
3872
/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
3881static vm_fault_t do_fault(struct vm_fault *vmf)
3882{
3883 struct vm_area_struct *vma = vmf->vma;
3884 struct mm_struct *vm_mm = vma->vm_mm;
3885 vm_fault_t ret;
3886
3887
3888 if (!vma->vm_ops->fault)
3889 ret = VM_FAULT_SIGBUS;
3890 else if (!(vmf->flags & FAULT_FLAG_WRITE))
3891 ret = do_read_fault(vmf);
3892 else if (!(vma->vm_flags & VM_SHARED))
3893 ret = do_cow_fault(vmf);
3894 else
3895 ret = do_shared_fault(vmf);
3896
3897
3898 if (vmf->prealloc_pte) {
3899 pte_free(vm_mm, vmf->prealloc_pte);
3900 vmf->prealloc_pte = NULL;
3901 }
3902 return ret;
3903}
3904
3905static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3906 unsigned long addr, int page_nid,
3907 int *flags)
3908{
3909 get_page(page);
3910
3911 count_vm_numa_event(NUMA_HINT_FAULTS);
3912 if (page_nid == numa_node_id()) {
3913 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3914 *flags |= TNF_FAULT_LOCAL;
3915 }
3916
3917 return mpol_misplaced(page, vma, addr);
3918}
3919
3920static vm_fault_t do_numa_page(struct vm_fault *vmf)
3921{
3922 struct vm_area_struct *vma = vmf->vma;
3923 struct page *page = NULL;
3924 int page_nid = NUMA_NO_NODE;
3925 int last_cpupid;
3926 int target_nid;
3927 bool migrated = false;
3928 pte_t pte, old_pte;
3929 bool was_writable = pte_savedwrite(vmf->orig_pte);
3930 int flags = 0;
3931
3932
3933
3934
3935
3936
3937 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3938 spin_lock(vmf->ptl);
3939 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3940 pte_unmap_unlock(vmf->pte, vmf->ptl);
3941 goto out;
3942 }
3943
3944
3945
3946
3947
3948 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
3949 pte = pte_modify(old_pte, vma->vm_page_prot);
3950 pte = pte_mkyoung(pte);
3951 if (was_writable)
3952 pte = pte_mkwrite(pte);
3953 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
3954 update_mmu_cache(vma, vmf->address, vmf->pte);
3955
3956 page = vm_normal_page(vma, vmf->address, pte);
3957 if (!page) {
3958 pte_unmap_unlock(vmf->pte, vmf->ptl);
3959 return 0;
3960 }
3961
3962
3963 if (PageCompound(page)) {
3964 pte_unmap_unlock(vmf->pte, vmf->ptl);
3965 return 0;
3966 }
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976 if (!pte_write(pte))
3977 flags |= TNF_NO_GROUP;
3978
3979
3980
3981
3982
3983 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3984 flags |= TNF_SHARED;
3985
3986 last_cpupid = page_cpupid_last(page);
3987 page_nid = page_to_nid(page);
3988 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3989 &flags);
3990 pte_unmap_unlock(vmf->pte, vmf->ptl);
3991 if (target_nid == NUMA_NO_NODE) {
3992 put_page(page);
3993 goto out;
3994 }
3995
3996
3997 migrated = migrate_misplaced_page(page, vma, target_nid);
3998 if (migrated) {
3999 page_nid = target_nid;
4000 flags |= TNF_MIGRATED;
4001 } else
4002 flags |= TNF_MIGRATE_FAIL;
4003
4004out:
4005 if (page_nid != NUMA_NO_NODE)
4006 task_numa_fault(last_cpupid, page_nid, 1, flags);
4007 return 0;
4008}
4009
4010static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4011{
4012 if (vma_is_anonymous(vmf->vma))
4013 return do_huge_pmd_anonymous_page(vmf);
4014 if (vmf->vma->vm_ops->huge_fault)
4015 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4016 return VM_FAULT_FALLBACK;
4017}
4018
4019
4020static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
4021{
4022 if (vma_is_anonymous(vmf->vma)) {
4023 if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
4024 return handle_userfault(vmf, VM_UFFD_WP);
4025 return do_huge_pmd_wp_page(vmf, orig_pmd);
4026 }
4027 if (vmf->vma->vm_ops->huge_fault) {
4028 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4029
4030 if (!(ret & VM_FAULT_FALLBACK))
4031 return ret;
4032 }
4033
4034
4035 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4036
4037 return VM_FAULT_FALLBACK;
4038}
4039
4040static inline bool vma_is_accessible(struct vm_area_struct *vma)
4041{
4042 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
4043}
4044
4045static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4046{
4047#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4048 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4049
4050 if (vma_is_anonymous(vmf->vma))
4051 goto split;
4052 if (vmf->vma->vm_ops->huge_fault) {
4053 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4054
4055 if (!(ret & VM_FAULT_FALLBACK))
4056 return ret;
4057 }
4058split:
4059
4060 __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4061#endif
4062 return VM_FAULT_FALLBACK;
4063}
4064
4065static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4066{
4067#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4068
4069 if (vma_is_anonymous(vmf->vma))
4070 return VM_FAULT_FALLBACK;
4071 if (vmf->vma->vm_ops->huge_fault)
4072 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4073#endif
4074 return VM_FAULT_FALLBACK;
4075}
4076
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
4092static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4093{
4094 pte_t entry;
4095
4096 if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate a huge page, and if we expose the page
		 * table for an instant anyway, it will be difficult to
		 * retract it from concurrent faults and from rmap lookups.
		 */
4103 vmf->pte = NULL;
4104 } else {
4105
4106 if (pmd_devmap_trans_unstable(vmf->pmd))
4107 return 0;
4108
4109
4110
4111
4112
4113
4114 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4115 vmf->orig_pte = *vmf->pte;
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125 barrier();
4126 if (pte_none(vmf->orig_pte)) {
4127 pte_unmap(vmf->pte);
4128 vmf->pte = NULL;
4129 }
4130 }
4131
4132 if (!vmf->pte) {
4133 if (vma_is_anonymous(vmf->vma))
4134 return do_anonymous_page(vmf);
4135 else
4136 return do_fault(vmf);
4137 }
4138
4139 if (!pte_present(vmf->orig_pte))
4140 return do_swap_page(vmf);
4141
4142 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4143 return do_numa_page(vmf);
4144
4145 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4146 spin_lock(vmf->ptl);
4147 entry = vmf->orig_pte;
4148 if (unlikely(!pte_same(*vmf->pte, entry)))
4149 goto unlock;
4150 if (vmf->flags & FAULT_FLAG_WRITE) {
4151 if (!pte_write(entry))
4152 return do_wp_page(vmf);
4153 entry = pte_mkdirty(entry);
4154 }
4155 entry = pte_mkyoung(entry);
4156 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4157 vmf->flags & FAULT_FLAG_WRITE)) {
4158 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4159 } else {
4160
4161 if (vmf->flags & FAULT_FLAG_TRIED)
4162 goto unlock;
4163
4164
4165
4166
4167
4168
4169 if (vmf->flags & FAULT_FLAG_WRITE)
4170 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4171 }
4172unlock:
4173 pte_unmap_unlock(vmf->pte, vmf->ptl);
4174 return 0;
4175}
4176
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
4183static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4184 unsigned long address, unsigned int flags)
4185{
4186 struct vm_fault vmf = {
4187 .vma = vma,
4188 .address = address & PAGE_MASK,
4189 .flags = flags,
4190 .pgoff = linear_page_index(vma, address),
4191 .gfp_mask = __get_fault_gfp_mask(vma),
4192 };
4193 unsigned int dirty = flags & FAULT_FLAG_WRITE;
4194 struct mm_struct *mm = vma->vm_mm;
4195 pgd_t *pgd;
4196 p4d_t *p4d;
4197 vm_fault_t ret;
4198
4199 pgd = pgd_offset(mm, address);
4200 p4d = p4d_alloc(mm, pgd, address);
4201 if (!p4d)
4202 return VM_FAULT_OOM;
4203
4204 vmf.pud = pud_alloc(mm, p4d, address);
4205 if (!vmf.pud)
4206 return VM_FAULT_OOM;
4207retry_pud:
4208 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4209 ret = create_huge_pud(&vmf);
4210 if (!(ret & VM_FAULT_FALLBACK))
4211 return ret;
4212 } else {
4213 pud_t orig_pud = *vmf.pud;
4214
4215 barrier();
4216 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4217
4218
4219
4220 if (dirty && !pud_write(orig_pud)) {
4221 ret = wp_huge_pud(&vmf, orig_pud);
4222 if (!(ret & VM_FAULT_FALLBACK))
4223 return ret;
4224 } else {
4225 huge_pud_set_accessed(&vmf, orig_pud);
4226 return 0;
4227 }
4228 }
4229 }
4230
4231 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4232 if (!vmf.pmd)
4233 return VM_FAULT_OOM;
4234
4235
4236 if (pud_trans_unstable(vmf.pud))
4237 goto retry_pud;
4238
4239 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4240 ret = create_huge_pmd(&vmf);
4241 if (!(ret & VM_FAULT_FALLBACK))
4242 return ret;
4243 } else {
4244 pmd_t orig_pmd = *vmf.pmd;
4245
4246 barrier();
4247 if (unlikely(is_swap_pmd(orig_pmd))) {
4248 VM_BUG_ON(thp_migration_supported() &&
4249 !is_pmd_migration_entry(orig_pmd));
4250 if (is_pmd_migration_entry(orig_pmd))
4251 pmd_migration_entry_wait(mm, vmf.pmd);
4252 return 0;
4253 }
4254 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4255 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4256 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4257
4258 if (dirty && !pmd_write(orig_pmd)) {
4259 ret = wp_huge_pmd(&vmf, orig_pmd);
4260 if (!(ret & VM_FAULT_FALLBACK))
4261 return ret;
4262 } else {
4263 huge_pmd_set_accessed(&vmf, orig_pmd);
4264 return 0;
4265 }
4266 }
4267 }
4268
4269 return handle_pte_fault(&vmf);
4270}
4271
/*
 * By the time we get here, we already hold the mmap_lock.
 *
 * This is the exported entry point: it takes care of the per-fault
 * accounting and memcg bookkeeping, then hands off to hugetlb_fault()
 * or __handle_mm_fault().
 */
4278vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4279 unsigned int flags)
4280{
4281 vm_fault_t ret;
4282
4283 __set_current_state(TASK_RUNNING);
4284
4285 count_vm_event(PGFAULT);
4286 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4287
4288
4289 check_sync_rss_stat(current);
4290
4291 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4292 flags & FAULT_FLAG_INSTRUCTION,
4293 flags & FAULT_FLAG_REMOTE))
4294 return VM_FAULT_SIGSEGV;
4295
4296
4297
4298
4299
4300 if (flags & FAULT_FLAG_USER)
4301 mem_cgroup_enter_user_fault();
4302
4303 if (unlikely(is_vm_hugetlb_page(vma)))
4304 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4305 else
4306 ret = __handle_mm_fault(vma, address, flags);
4307
4308 if (flags & FAULT_FLAG_USER) {
4309 mem_cgroup_exit_user_fault();
4310
4311
4312
4313
4314
4315
4316 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4317 mem_cgroup_oom_synchronize(false);
4318 }
4319
4320 return ret;
4321}
4322EXPORT_SYMBOL_GPL(handle_mm_fault);
4323
4324#ifndef __PAGETABLE_P4D_FOLDED
4325
4326
4327
4328
4329int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4330{
4331 p4d_t *new = p4d_alloc_one(mm, address);
4332 if (!new)
4333 return -ENOMEM;
4334
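	/*
	 * Make sure the new p4d table is fully initialised before it can be
	 * observed once pgd_populate() links it in (see the comment in
	 * __pte_alloc() for the pairing read side).
	 */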
4335 smp_wmb();
4336
4337 spin_lock(&mm->page_table_lock);
4338 if (pgd_present(*pgd))
4339 p4d_free(mm, new);
4340 else
4341 pgd_populate(mm, pgd, new);
4342 spin_unlock(&mm->page_table_lock);
4343 return 0;
4344}
4345#endif
4346
4347#ifndef __PAGETABLE_PUD_FOLDED
4348
4349
4350
4351
4352int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4353{
4354 pud_t *new = pud_alloc_one(mm, address);
4355 if (!new)
4356 return -ENOMEM;
4357
4358 smp_wmb();
4359
4360 spin_lock(&mm->page_table_lock);
4361#ifndef __ARCH_HAS_5LEVEL_HACK
4362 if (!p4d_present(*p4d)) {
4363 mm_inc_nr_puds(mm);
4364 p4d_populate(mm, p4d, new);
4365 } else
4366 pud_free(mm, new);
4367#else
4368 if (!pgd_present(*p4d)) {
4369 mm_inc_nr_puds(mm);
4370 pgd_populate(mm, p4d, new);
4371 } else
4372 pud_free(mm, new);
4373#endif
4374 spin_unlock(&mm->page_table_lock);
4375 return 0;
4376}
4377#endif
4378
4379#ifndef __PAGETABLE_PMD_FOLDED
4380
4381
4382
4383
4384int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4385{
4386 spinlock_t *ptl;
4387 pmd_t *new = pmd_alloc_one(mm, address);
4388 if (!new)
4389 return -ENOMEM;
4390
4391 smp_wmb();
4392
4393 ptl = pud_lock(mm, pud);
4394#ifndef __ARCH_HAS_4LEVEL_HACK
4395 if (!pud_present(*pud)) {
4396 mm_inc_nr_pmds(mm);
4397 pud_populate(mm, pud, new);
4398 } else
4399 pmd_free(mm, new);
4400#else
4401 if (!pgd_present(*pud)) {
4402 mm_inc_nr_pmds(mm);
4403 pgd_populate(mm, pud, new);
4404 } else
4405 pmd_free(mm, new);
4406#endif
4407 spin_unlock(ptl);
4408 return 0;
4409}
4410#endif
4411
4412static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4413 struct mmu_notifier_range *range,
4414 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4415{
4416 pgd_t *pgd;
4417 p4d_t *p4d;
4418 pud_t *pud;
4419 pmd_t *pmd;
4420 pte_t *ptep;
4421
4422 pgd = pgd_offset(mm, address);
4423 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4424 goto out;
4425
4426 p4d = p4d_offset(pgd, address);
4427 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4428 goto out;
4429
4430 pud = pud_offset(p4d, address);
4431 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4432 goto out;
4433
4434 pmd = pmd_offset(pud, address);
4435 VM_BUG_ON(pmd_trans_huge(*pmd));
4436
4437 if (pmd_huge(*pmd)) {
4438 if (!pmdpp)
4439 goto out;
4440
4441 if (range) {
4442 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4443 NULL, mm, address & PMD_MASK,
4444 (address & PMD_MASK) + PMD_SIZE);
4445 mmu_notifier_invalidate_range_start(range);
4446 }
4447 *ptlp = pmd_lock(mm, pmd);
4448 if (pmd_huge(*pmd)) {
4449 *pmdpp = pmd;
4450 return 0;
4451 }
4452 spin_unlock(*ptlp);
4453 if (range)
4454 mmu_notifier_invalidate_range_end(range);
4455 }
4456
4457 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4458 goto out;
4459
4460 if (range) {
4461 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4462 address & PAGE_MASK,
4463 (address & PAGE_MASK) + PAGE_SIZE);
4464 mmu_notifier_invalidate_range_start(range);
4465 }
4466 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4467 if (!pte_present(*ptep))
4468 goto unlock;
4469 *ptepp = ptep;
4470 return 0;
4471unlock:
4472 pte_unmap_unlock(ptep, *ptlp);
4473 if (range)
4474 mmu_notifier_invalidate_range_end(range);
4475out:
4476 return -EINVAL;
4477}
4478
4479static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4480 pte_t **ptepp, spinlock_t **ptlp)
4481{
4482 int res;
4483
4484
4485 (void) __cond_lock(*ptlp,
4486 !(res = __follow_pte_pmd(mm, address, NULL,
4487 ptepp, NULL, ptlp)));
4488 return res;
4489}
4490
4491int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4492 struct mmu_notifier_range *range,
4493 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4494{
4495 int res;
4496
4497
4498 (void) __cond_lock(*ptlp,
4499 !(res = __follow_pte_pmd(mm, address, range,
4500 ptepp, pmdpp, ptlp)));
4501 return res;
4502}
4503EXPORT_SYMBOL(follow_pte_pmd);
4504
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
4515int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4516 unsigned long *pfn)
4517{
4518 int ret = -EINVAL;
4519 spinlock_t *ptl;
4520 pte_t *ptep;
4521
4522 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4523 return ret;
4524
4525 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4526 if (ret)
4527 return ret;
4528 *pfn = pte_pfn(*ptep);
4529 pte_unmap_unlock(ptep, ptl);
4530 return 0;
4531}
4532EXPORT_SYMBOL(follow_pfn);
4533
4534#ifdef CONFIG_HAVE_IOREMAP_PROT
4535int follow_phys(struct vm_area_struct *vma,
4536 unsigned long address, unsigned int flags,
4537 unsigned long *prot, resource_size_t *phys)
4538{
4539 int ret = -EINVAL;
4540 pte_t *ptep, pte;
4541 spinlock_t *ptl;
4542
4543 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4544 goto out;
4545
4546 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4547 goto out;
4548 pte = *ptep;
4549
4550 if ((flags & FOLL_WRITE) && !pte_write(pte))
4551 goto unlock;
4552
4553 *prot = pgprot_val(pte_pgprot(pte));
4554 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4555
4556 ret = 0;
4557unlock:
4558 pte_unmap_unlock(ptep, ptl);
4559out:
4560 return ret;
4561}
4562
4563int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4564 void *buf, int len, int write)
4565{
4566 resource_size_t phys_addr;
4567 unsigned long prot = 0;
4568 void __iomem *maddr;
4569 int offset = addr & (PAGE_SIZE-1);
4570
4571 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4572 return -EINVAL;
4573
4574 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4575 if (!maddr)
4576 return -ENOMEM;
4577
4578 if (write)
4579 memcpy_toio(maddr + offset, buf, len);
4580 else
4581 memcpy_fromio(buf, maddr + offset, len);
4582 iounmap(maddr);
4583
4584 return len;
4585}
4586EXPORT_SYMBOL_GPL(generic_access_phys);
4587#endif
4588
/*
 * Access another process' address space as given in mm.  If non-NULL,
 * use the given task for page fault accounting.
 */
4593int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4594 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4595{
4596 struct vm_area_struct *vma;
4597 void *old_buf = buf;
4598 int write = gup_flags & FOLL_WRITE;
4599
4600 if (mmap_read_lock_killable(mm))
4601 return 0;
4602
4603
4604 while (len) {
4605 int bytes, ret, offset;
4606 void *maddr;
4607 struct page *page = NULL;
4608
4609 ret = get_user_pages_remote(tsk, mm, addr, 1,
4610 gup_flags, &page, &vma, NULL);
4611 if (ret <= 0) {
4612#ifndef CONFIG_HAVE_IOREMAP_PROT
4613 break;
4614#else
4615
4616
4617
4618
4619 vma = vma_lookup(mm, addr);
4620 if (!vma)
4621 break;
4622 if (vma->vm_ops && vma->vm_ops->access)
4623 ret = vma->vm_ops->access(vma, addr, buf,
4624 len, write);
4625 if (ret <= 0)
4626 break;
4627 bytes = ret;
4628#endif
4629 } else {
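			/*
			 * Copy at most to the end of the current page; the
			 * outer loop advances page by page.
			 */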
4630 bytes = len;
4631 offset = addr & (PAGE_SIZE-1);
4632 if (bytes > PAGE_SIZE-offset)
4633 bytes = PAGE_SIZE-offset;
4634
4635 maddr = kmap(page);
4636 if (write) {
4637 copy_to_user_page(vma, page, addr,
4638 maddr + offset, buf, bytes);
4639 set_page_dirty_lock(page);
4640 } else {
4641 copy_from_user_page(vma, page, addr,
4642 buf, maddr + offset, bytes);
4643 }
4644 kunmap(page);
4645 put_page(page);
4646 }
4647 len -= bytes;
4648 buf += bytes;
4649 addr += bytes;
4650 }
4651 mmap_read_unlock(mm);
4652
4653 return buf - old_buf;
4654}
4655
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
4668int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4669 void *buf, int len, unsigned int gup_flags)
4670{
4671 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4672}
4673
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Do not walk the page table directly; use get_user_pages().
 */
4679int access_process_vm(struct task_struct *tsk, unsigned long addr,
4680 void *buf, int len, unsigned int gup_flags)
4681{
4682 struct mm_struct *mm;
4683 int ret;
4684
4685 mm = get_task_mm(tsk);
4686 if (!mm)
4687 return 0;
4688
4689 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4690
4691 mmput(mm);
4692
4693 return ret;
4694}
4695EXPORT_SYMBOL_GPL(access_process_vm);
4696
/*
 * Print the name of a VMA.
 */
4700void print_vma_addr(char *prefix, unsigned long ip)
4701{
4702 struct mm_struct *mm = current->mm;
4703 struct vm_area_struct *vma;
4704
4705
4706
4707
4708 if (!mmap_read_trylock(mm))
4709 return;
4710
4711 vma = find_vma(mm, ip);
4712 if (vma && vma->vm_file) {
4713 struct file *f = vma->vm_file;
4714 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4715 if (buf) {
4716 char *p;
4717
4718 p = file_path(f, buf, PAGE_SIZE);
4719 if (IS_ERR(p))
4720 p = "?";
4721 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4722 vma->vm_start,
4723 vma->vm_end - vma->vm_start);
4724 free_page((unsigned long)buf);
4725 }
4726 }
4727 mmap_read_unlock(mm);
4728}
4729
4730#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4731void __might_fault(const char *file, int line)
4732{
4733
4734
4735
4736
4737
4738
4739 if (uaccess_kernel())
4740 return;
4741 if (pagefault_disabled())
4742 return;
4743 __might_sleep(file, line, 0);
4744#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4745 if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
4747#endif
4748}
4749EXPORT_SYMBOL(__might_fault);
4750#endif
4751
4752#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation, leaving the subpage at the faulting address (and its
 * neighbours) until last so that its cache lines are still hot when
 * the caller returns to userspace.
 */
4758static inline void process_huge_page(
4759 unsigned long addr_hint, unsigned int pages_per_huge_page,
4760 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4761 void *arg)
4762{
4763 int i, n, base, l;
4764 unsigned long addr = addr_hint &
4765 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4766
4767
4768 might_sleep();
4769 n = (addr_hint - addr) / PAGE_SIZE;
4770 if (2 * n <= pages_per_huge_page) {
4771
4772 base = 0;
4773 l = n;
4774
4775 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4776 cond_resched();
4777 process_subpage(addr + i * PAGE_SIZE, i, arg);
4778 }
4779 } else {
4780
4781 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4782 l = pages_per_huge_page - n;
4783
4784 for (i = 0; i < base; i++) {
4785 cond_resched();
4786 process_subpage(addr + i * PAGE_SIZE, i, arg);
4787 }
4788 }
4789
4790
4791
4792
4793 for (i = 0; i < l; i++) {
4794 int left_idx = base + i;
4795 int right_idx = base + 2 * l - 1 - i;
4796
4797 cond_resched();
4798 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4799 cond_resched();
4800 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4801 }
4802}
4803
4804static void clear_gigantic_page(struct page *page,
4805 unsigned long addr,
4806 unsigned int pages_per_huge_page)
4807{
4808 int i;
4809 struct page *p = page;
4810
4811 might_sleep();
4812 for (i = 0; i < pages_per_huge_page;
4813 i++, p = mem_map_next(p, page, i)) {
4814 cond_resched();
4815 clear_user_highpage(p, addr + i * PAGE_SIZE);
4816 }
4817}
4818
4819static void clear_subpage(unsigned long addr, int idx, void *arg)
4820{
4821 struct page *page = arg;
4822
4823 clear_user_highpage(page + idx, addr);
4824}
4825
4826void clear_huge_page(struct page *page,
4827 unsigned long addr_hint, unsigned int pages_per_huge_page)
4828{
4829 unsigned long addr = addr_hint &
4830 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4831
4832 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4833 clear_gigantic_page(page, addr, pages_per_huge_page);
4834 return;
4835 }
4836
4837 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4838}
4839
4840static void copy_user_gigantic_page(struct page *dst, struct page *src,
4841 unsigned long addr,
4842 struct vm_area_struct *vma,
4843 unsigned int pages_per_huge_page)
4844{
4845 int i;
4846 struct page *dst_base = dst;
4847 struct page *src_base = src;
4848
4849 for (i = 0; i < pages_per_huge_page; ) {
4850 cond_resched();
4851 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4852
4853 i++;
4854 dst = mem_map_next(dst, dst_base, i);
4855 src = mem_map_next(src, src_base, i);
4856 }
4857}
4858
4859struct copy_subpage_arg {
4860 struct page *dst;
4861 struct page *src;
4862 struct vm_area_struct *vma;
4863};
4864
4865static void copy_subpage(unsigned long addr, int idx, void *arg)
4866{
4867 struct copy_subpage_arg *copy_arg = arg;
4868
4869 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4870 addr, copy_arg->vma);
4871}
4872
4873void copy_user_huge_page(struct page *dst, struct page *src,
4874 unsigned long addr_hint, struct vm_area_struct *vma,
4875 unsigned int pages_per_huge_page)
4876{
4877 unsigned long addr = addr_hint &
4878 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4879 struct copy_subpage_arg arg = {
4880 .dst = dst,
4881 .src = src,
4882 .vma = vma,
4883 };
4884
4885 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4886 copy_user_gigantic_page(dst, src, addr, vma,
4887 pages_per_huge_page);
4888 return;
4889 }
4890
4891 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4892}
4893
4894long copy_huge_page_from_user(struct page *dst_page,
4895 const void __user *usr_src,
4896 unsigned int pages_per_huge_page,
4897 bool allow_pagefault)
4898{
4899 void *src = (void *)usr_src;
4900 void *page_kaddr;
4901 unsigned long i, rc = 0;
4902 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4903 struct page *subpage = dst_page;
4904
4905 for (i = 0; i < pages_per_huge_page;
4906 i++, subpage = mem_map_next(subpage, dst_page, i)) {
4907 if (allow_pagefault)
4908 page_kaddr = kmap(subpage);
4909 else
4910 page_kaddr = kmap_atomic(subpage);
4911 rc = copy_from_user(page_kaddr,
4912 (const void __user *)(src + i * PAGE_SIZE),
4913 PAGE_SIZE);
4914 if (allow_pagefault)
4915 kunmap(subpage);
4916 else
4917 kunmap_atomic(page_kaddr);
4918
4919 ret_val -= (PAGE_SIZE - rc);
4920 if (rc)
4921 break;
4922
4923 cond_resched();
4924 }
4925 return ret_val;
4926}
4927#endif
4928
4929#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4930
4931static struct kmem_cache *page_ptl_cachep;
4932
4933void __init ptlock_cache_init(void)
4934{
4935 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4936 SLAB_PANIC, NULL);
4937}
4938
4939bool ptlock_alloc(struct page *page)
4940{
4941 spinlock_t *ptl;
4942
4943 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4944 if (!ptl)
4945 return false;
4946 page->ptl = ptl;
4947 return true;
4948}
4949
4950void ptlock_free(struct page *page)
4951{
4952 kmem_cache_free(page_ptl_cachep, page->ptl);
4953}
4954#endif
4955