// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/* high_memory marks the upper bound of the kernel's direct mapping. */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * Those arches which don't have hw access flag feature need to
	 * implement their own helper. By default, "true" means pagefault
	 * will be hit on old pte.
	 */
	return true;
}
#endif

#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware.  By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
	return false;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
	trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif

/*
 * Note: this doesn't free the actual pages themselves.  It only frees
 * the pte-level page table that used to map them; tearing down the
 * mappings is the caller's job.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}
234
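/*
 * Free the pte-level page tables for the range [addr, end) under this pud,
 * then, if the whole span covered by the pud lies within [floor, ceiling),
 * free the pmd page itself and drop the mm's pmd accounting.
 */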
235static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
236 unsigned long addr, unsigned long end,
237 unsigned long floor, unsigned long ceiling)
238{
239 pmd_t *pmd;
240 unsigned long next;
241 unsigned long start;
242
243 start = addr;
244 pmd = pmd_offset(pud, addr);
245 do {
246 next = pmd_addr_end(addr, end);
247 if (pmd_none_or_clear_bad(pmd))
248 continue;
249 free_pte_range(tlb, pmd, addr);
250 } while (pmd++, addr = next, addr != end);
251
252 start &= PUD_MASK;
253 if (start < floor)
254 return;
255 if (ceiling) {
256 ceiling &= PUD_MASK;
257 if (!ceiling)
258 return;
259 }
260 if (end - 1 > ceiling - 1)
261 return;
262
263 pmd = pmd_offset(pud, start);
264 pud_clear(pud);
265 pmd_free_tlb(tlb, pmd, start);
266 mm_dec_nr_pmds(tlb->mm);
267}
268
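/*
 * Same as above, one level up: free the pmd-level tables in the range and,
 * when the range within [floor, ceiling) spans the whole p4d entry, free
 * the pud page and decrement the mm's pud count.
 */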
269static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
270 unsigned long addr, unsigned long end,
271 unsigned long floor, unsigned long ceiling)
272{
273 pud_t *pud;
274 unsigned long next;
275 unsigned long start;
276
277 start = addr;
278 pud = pud_offset(p4d, addr);
279 do {
280 next = pud_addr_end(addr, end);
281 if (pud_none_or_clear_bad(pud))
282 continue;
283 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
284 } while (pud++, addr = next, addr != end);
285
286 start &= P4D_MASK;
287 if (start < floor)
288 return;
289 if (ceiling) {
290 ceiling &= P4D_MASK;
291 if (!ceiling)
292 return;
293 }
294 if (end - 1 > ceiling - 1)
295 return;
296
297 pud = pud_offset(p4d, start);
298 p4d_clear(p4d);
299 pud_free_tlb(tlb, pud, start);
300 mm_dec_nr_puds(tlb->mm);
301}
302
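/*
 * Free the pud-level tables in the range and, when permitted by
 * floor/ceiling, release the p4d page hanging off this pgd entry.
 */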
303static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
304 unsigned long addr, unsigned long end,
305 unsigned long floor, unsigned long ceiling)
306{
307 p4d_t *p4d;
308 unsigned long next;
309 unsigned long start;
310
311 start = addr;
312 p4d = p4d_offset(pgd, addr);
313 do {
314 next = p4d_addr_end(addr, end);
315 if (p4d_none_or_clear_bad(p4d))
316 continue;
317 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
318 } while (p4d++, addr = next, addr != end);
319
320 start &= PGDIR_MASK;
321 if (start < floor)
322 return;
323 if (ceiling) {
324 ceiling &= PGDIR_MASK;
325 if (!ceiling)
326 return;
327 }
328 if (end - 1 > ceiling - 1)
329 return;
330
331 p4d = p4d_offset(pgd, start);
332 pgd_clear(pgd);
333 p4d_free_tlb(tlb, p4d, start);
334}
335
/*
 * This function frees user-level page tables of a process.
 */
339void free_pgd_range(struct mmu_gather *tlb,
340 unsigned long addr, unsigned long end,
341 unsigned long floor, unsigned long ceiling)
342{
343 pgd_t *pgd;
344 unsigned long next;
345
	/*
	 * The next few lines trim [addr, end) and check it against
	 * floor/ceiling: page-table pages that may still be needed for
	 * mappings outside of this range must not be freed, so the region
	 * is shrunk to whole PMD-sized units that lie entirely within the
	 * allowed window before any freeing is done.
	 */
372 addr &= PMD_MASK;
373 if (addr < floor) {
374 addr += PMD_SIZE;
375 if (!addr)
376 return;
377 }
378 if (ceiling) {
379 ceiling &= PMD_MASK;
380 if (!ceiling)
381 return;
382 }
383 if (end - 1 > ceiling - 1)
384 end -= PMD_SIZE;
385 if (addr > end - 1)
386 return;
387
388
389
390
391 tlb_change_page_size(tlb, PAGE_SIZE);
392 pgd = pgd_offset(tlb->mm, addr);
393 do {
394 next = pgd_addr_end(addr, end);
395 if (pgd_none_or_clear_bad(pgd))
396 continue;
397 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
398 } while (pgd++, addr = next, addr != end);
399}
400
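/*
 * Walk the vma list, unlink each vma from rmap and its file mapping, and
 * free the page tables backing the unmapped ranges, clamped by floor and
 * ceiling so that tables shared with neighbouring mappings survive.
 */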
401void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
402 unsigned long floor, unsigned long ceiling)
403{
404 while (vma) {
405 struct vm_area_struct *next = vma->vm_next;
406 unsigned long addr = vma->vm_start;
407
		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
412 unlink_anon_vmas(vma);
413 unlink_file_vma(vma);
414
415 if (is_vm_hugetlb_page(vma)) {
416 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
417 floor, next ? next->vm_start : ceiling);
418 } else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
422 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
423 && !is_vm_hugetlb_page(next)) {
424 vma = next;
425 next = vma->vm_next;
426 unlink_anon_vmas(vma);
427 unlink_file_vma(vma);
428 }
429 free_pgd_range(tlb, addr, vma->vm_end,
430 floor, next ? next->vm_start : ceiling);
431 }
432 vma = next;
433 }
434}
435
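/*
 * Publish a preallocated pte page into an empty pmd entry.  On success the
 * caller's reference is consumed (*pte is set to NULL); if another thread
 * raced and populated the pmd first, *pte is left for the caller to free.
 */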
436void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
437{
438 spinlock_t *ptl = pmd_lock(mm, pmd);
439
440 if (likely(pmd_none(*pmd))) {
441 mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (e.g. the pte lock initialization) is
		 * visible before the pte is made visible to other CPUs by
		 * being put into the page tables: the smp_wmb() below pairs
		 * with the address-dependency barrier in page-table walkers
		 * that read the pmd entry before dereferencing the pte page.
		 */
455 smp_wmb();
456 pmd_populate(mm, pmd, *pte);
457 *pte = NULL;
458 }
459 spin_unlock(ptl);
460}
461
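/*
 * Allocate a pte page for a user pmd and install it with pmd_install();
 * the page is freed again if somebody else populated the pmd meanwhile.
 */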
462int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
463{
464 pgtable_t new = pte_alloc_one(mm);
465 if (!new)
466 return -ENOMEM;
467
468 pmd_install(mm, pmd, &new);
469 if (new)
470 pte_free(mm, new);
471 return 0;
472}
473
474int __pte_alloc_kernel(pmd_t *pmd)
475{
476 pte_t *new = pte_alloc_one_kernel(&init_mm);
477 if (!new)
478 return -ENOMEM;
479
480 spin_lock(&init_mm.page_table_lock);
481 if (likely(pmd_none(*pmd))) {
482 smp_wmb();
483 pmd_populate_kernel(&init_mm, pmd, new);
484 new = NULL;
485 }
486 spin_unlock(&init_mm.page_table_lock);
487 if (new)
488 pte_free_kernel(&init_mm, new);
489 return 0;
490}
491
492static inline void init_rss_vec(int *rss)
493{
494 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
495}
496
497static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
498{
499 int i;
500
501 if (current->mm == mm)
502 sync_mm_rss(mm);
503 for (i = 0; i < NR_MM_COUNTERS; i++)
504 if (rss[i])
505 add_mm_counter(mm, i, rss[i]);
506}
507
/*
 * This function is called to print an error when a bad pte
 * is found.  For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
515static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
516 pte_t pte, struct page *page)
517{
518 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
519 p4d_t *p4d = p4d_offset(pgd, addr);
520 pud_t *pud = pud_offset(p4d, addr);
521 pmd_t *pmd = pmd_offset(pud, addr);
522 struct address_space *mapping;
523 pgoff_t index;
524 static unsigned long resume;
525 static unsigned long nr_shown;
526 static unsigned long nr_unshown;
527
	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
532 if (nr_shown == 60) {
533 if (time_before(jiffies, resume)) {
534 nr_unshown++;
535 return;
536 }
537 if (nr_unshown) {
538 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
539 nr_unshown);
540 nr_unshown = 0;
541 }
542 nr_shown = 0;
543 }
544 if (nr_shown++ == 0)
545 resume = jiffies + 60 * HZ;
546
547 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
548 index = linear_page_index(vma, addr);
549
550 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
551 current->comm,
552 (long long)pte_val(pte), (long long)pmd_val(*pmd));
553 if (page)
554 dump_page(page, "bad pte");
555 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
556 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
557 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
558 vma->vm_file,
559 vma->vm_ops ? vma->vm_ops->fault : NULL,
560 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
561 mapping ? mapping->a_ops->readpage : NULL);
562 dump_stack();
563 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
564}
565
/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here.  "Normal" mappings do have a
 * struct page.
 *
 * There are two broad cases.  Firstly, an architecture may define a
 * pte_special() pte bit, in which case special mappings are always marked
 * via that bit (VM_PFNMAP and VM_MIXEDMAP mappings set it when inserting
 * pfns).
 *
 * Otherwise, the architecture has no special pte bit and the decision is
 * made from the vma's flags: in a VM_PFNMAP mapping a pte is normal only
 * if it is a COW page that does not sit at the "linear" pfn position, and
 * in a VM_MIXEDMAP mapping a pte is normal whenever its pfn has a valid
 * memmap entry.
 */
608struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
609 pte_t pte)
610{
611 unsigned long pfn = pte_pfn(pte);
612
613 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
614 if (likely(!pte_special(pte)))
615 goto check_pfn;
616 if (vma->vm_ops && vma->vm_ops->find_special_page)
617 return vma->vm_ops->find_special_page(vma, addr);
618 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
619 return NULL;
620 if (is_zero_pfn(pfn))
621 return NULL;
622 if (pte_devmap(pte))
623 return NULL;
624
625 print_bad_pte(vma, addr, pte, NULL);
626 return NULL;
627 }

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
631 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
632 if (vma->vm_flags & VM_MIXEDMAP) {
633 if (!pfn_valid(pfn))
634 return NULL;
635 goto out;
636 } else {
637 unsigned long off;
638 off = (addr - vma->vm_start) >> PAGE_SHIFT;
639 if (pfn == vma->vm_pgoff + off)
640 return NULL;
641 if (!is_cow_mapping(vma->vm_flags))
642 return NULL;
643 }
644 }
645
646 if (is_zero_pfn(pfn))
647 return NULL;
648
649check_pfn:
650 if (unlikely(pfn > highest_memmap_pfn)) {
651 print_bad_pte(vma, addr, pte, NULL);
652 return NULL;
653 }
654
	/*
	 * NOTE! We still have PageReserved() pages in the page tables;
	 * e.g. VDSO mappings can cause them to exist.
	 */
659out:
660 return pfn_to_page(pfn);
661}
662
663#ifdef CONFIG_TRANSPARENT_HUGEPAGE
664struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
665 pmd_t pmd)
666{
667 unsigned long pfn = pmd_pfn(pmd);
668
669
670
671
672
673
674 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
675 if (vma->vm_flags & VM_MIXEDMAP) {
676 if (!pfn_valid(pfn))
677 return NULL;
678 goto out;
679 } else {
680 unsigned long off;
681 off = (addr - vma->vm_start) >> PAGE_SHIFT;
682 if (pfn == vma->vm_pgoff + off)
683 return NULL;
684 if (!is_cow_mapping(vma->vm_flags))
685 return NULL;
686 }
687 }
688
689 if (pmd_devmap(pmd))
690 return NULL;
691 if (is_huge_zero_pmd(pmd))
692 return NULL;
693 if (unlikely(pfn > highest_memmap_pfn))
694 return NULL;
695
696
697
698
699
700out:
701 return pfn_to_page(pfn);
702}
703#endif
704
705static void restore_exclusive_pte(struct vm_area_struct *vma,
706 struct page *page, unsigned long address,
707 pte_t *ptep)
708{
709 pte_t pte;
710 swp_entry_t entry;
711
712 pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
713 if (pte_swp_soft_dirty(*ptep))
714 pte = pte_mksoft_dirty(pte);
715
716 entry = pte_to_swp_entry(*ptep);
717 if (pte_swp_uffd_wp(*ptep))
718 pte = pte_mkuffd_wp(pte);
719 else if (is_writable_device_exclusive_entry(entry))
720 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
721
722 set_pte_at(vma->vm_mm, address, ptep, pte);
723
724
725
726
727
728 if (PageAnon(page))
729 page_add_anon_rmap(page, vma, address, false);
730 else
731
732
733
734
735 WARN_ON_ONCE(!PageAnon(page));
736
737 if (vma->vm_flags & VM_LOCKED)
738 mlock_vma_page(page);
739
740
741
742
743
744 update_mmu_cache(vma, address, ptep);
745}
746
747
748
749
750
751static int
752try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
753 unsigned long addr)
754{
755 swp_entry_t entry = pte_to_swp_entry(*src_pte);
756 struct page *page = pfn_swap_entry_to_page(entry);
757
758 if (trylock_page(page)) {
759 restore_exclusive_pte(vma, page, addr, src_pte);
760 unlock_page(page);
761 return 0;
762 }
763
764 return -EBUSY;
765}
766
767
768
769
770
771
772
773static unsigned long
774copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
775 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
776 struct vm_area_struct *src_vma, unsigned long addr, int *rss)
777{
778 unsigned long vm_flags = dst_vma->vm_flags;
779 pte_t pte = *src_pte;
780 struct page *page;
781 swp_entry_t entry = pte_to_swp_entry(pte);
782
783 if (likely(!non_swap_entry(entry))) {
784 if (swap_duplicate(entry) < 0)
785 return -EIO;
786
787
788 if (unlikely(list_empty(&dst_mm->mmlist))) {
789 spin_lock(&mmlist_lock);
790 if (list_empty(&dst_mm->mmlist))
791 list_add(&dst_mm->mmlist,
792 &src_mm->mmlist);
793 spin_unlock(&mmlist_lock);
794 }
795 rss[MM_SWAPENTS]++;
796 } else if (is_migration_entry(entry)) {
797 page = pfn_swap_entry_to_page(entry);
798
799 rss[mm_counter(page)]++;
800
801 if (is_writable_migration_entry(entry) &&
802 is_cow_mapping(vm_flags)) {
803
804
805
806
807 entry = make_readable_migration_entry(
808 swp_offset(entry));
809 pte = swp_entry_to_pte(entry);
810 if (pte_swp_soft_dirty(*src_pte))
811 pte = pte_swp_mksoft_dirty(pte);
812 if (pte_swp_uffd_wp(*src_pte))
813 pte = pte_swp_mkuffd_wp(pte);
814 set_pte_at(src_mm, addr, src_pte, pte);
815 }
816 } else if (is_device_private_entry(entry)) {
817 page = pfn_swap_entry_to_page(entry);
818
819
820
821
822
823
824
825
826
827
828 get_page(page);
829 rss[mm_counter(page)]++;
830 page_dup_rmap(page, false);
831
832
833
834
835
836
837
838
839 if (is_writable_device_private_entry(entry) &&
840 is_cow_mapping(vm_flags)) {
841 entry = make_readable_device_private_entry(
842 swp_offset(entry));
843 pte = swp_entry_to_pte(entry);
844 if (pte_swp_uffd_wp(*src_pte))
845 pte = pte_swp_mkuffd_wp(pte);
846 set_pte_at(src_mm, addr, src_pte, pte);
847 }
848 } else if (is_device_exclusive_entry(entry)) {
849
850
851
852
853
854
855 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
856 if (try_restore_exclusive_pte(src_pte, src_vma, addr))
857 return -EBUSY;
858 return -ENOENT;
859 }
860 if (!userfaultfd_wp(dst_vma))
861 pte = pte_swp_clear_uffd_wp(pte);
862 set_pte_at(dst_mm, addr, dst_pte, pte);
863 return 0;
864}
865
/*
 * Copy a present and normal page at fork time.
 *
 * The usual case is that no copy is required at all: the function returns
 * a positive value and the caller just shares the page by bumping its
 * refcount and mapcount.  A copy (early COW) is only forced when the page
 * may be pinned for DMA; in that case the preallocated page is consumed
 * and 0 is returned, or -EAGAIN if no preallocated page is available yet
 * so the caller can allocate one outside the page-table lock and retry.
 */
886static inline int
887copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
888 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
889 struct page **prealloc, pte_t pte, struct page *page)
890{
891 struct page *new_page;
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906 if (likely(!page_needs_cow_for_dma(src_vma, page)))
907 return 1;
908
909 new_page = *prealloc;
910 if (!new_page)
911 return -EAGAIN;
912
913
914
915
916
917 *prealloc = NULL;
918 copy_user_highpage(new_page, page, addr, src_vma);
919 __SetPageUptodate(new_page);
920 page_add_new_anon_rmap(new_page, dst_vma, addr, false);
921 lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
922 rss[mm_counter(new_page)]++;
923
924
925 pte = mk_pte(new_page, dst_vma->vm_page_prot);
926 pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
927 if (userfaultfd_pte_wp(dst_vma, *src_pte))
928
929 pte = pte_wrprotect(pte_mkuffd_wp(pte));
930 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
931 return 0;
932}
933
/*
 * Copy one present pte.  Returns 0 if succeeded, or -EAGAIN if a
 * preallocated page is needed to break COW for this pte.
 */
938static inline int
939copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
940 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
941 struct page **prealloc)
942{
943 struct mm_struct *src_mm = src_vma->vm_mm;
944 unsigned long vm_flags = src_vma->vm_flags;
945 pte_t pte = *src_pte;
946 struct page *page;
947
948 page = vm_normal_page(src_vma, addr, pte);
949 if (page) {
950 int retval;
951
952 retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
953 addr, rss, prealloc, pte, page);
954 if (retval <= 0)
955 return retval;
956
957 get_page(page);
958 page_dup_rmap(page, false);
959 rss[mm_counter(page)]++;
960 }
961
962
963
964
965
966 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
967 ptep_set_wrprotect(src_mm, addr, src_pte);
968 pte = pte_wrprotect(pte);
969 }
970
971
972
973
974
975 if (vm_flags & VM_SHARED)
976 pte = pte_mkclean(pte);
977 pte = pte_mkold(pte);
978
979 if (!userfaultfd_wp(dst_vma))
980 pte = pte_clear_uffd_wp(pte);
981
982 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
983 return 0;
984}
985
986static inline struct page *
987page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
988 unsigned long addr)
989{
990 struct page *new_page;
991
992 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
993 if (!new_page)
994 return NULL;
995
996 if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
997 put_page(new_page);
998 return NULL;
999 }
1000 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
1001
1002 return new_page;
1003}
1004
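/*
 * Copy one pte page worth of mappings from the parent to the child at
 * fork time.  The loop drops the page-table locks and restarts when it
 * needs to reschedule, add a swap-count continuation, or preallocate a
 * page for an early copy-on-write of a pinned page.
 */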
1005static int
1006copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1007 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1008 unsigned long end)
1009{
1010 struct mm_struct *dst_mm = dst_vma->vm_mm;
1011 struct mm_struct *src_mm = src_vma->vm_mm;
1012 pte_t *orig_src_pte, *orig_dst_pte;
1013 pte_t *src_pte, *dst_pte;
1014 spinlock_t *src_ptl, *dst_ptl;
1015 int progress, ret = 0;
1016 int rss[NR_MM_COUNTERS];
1017 swp_entry_t entry = (swp_entry_t){0};
1018 struct page *prealloc = NULL;
1019
1020again:
1021 progress = 0;
1022 init_rss_vec(rss);
1023
1024 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1025 if (!dst_pte) {
1026 ret = -ENOMEM;
1027 goto out;
1028 }
1029 src_pte = pte_offset_map(src_pmd, addr);
1030 src_ptl = pte_lockptr(src_mm, src_pmd);
1031 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1032 orig_src_pte = src_pte;
1033 orig_dst_pte = dst_pte;
1034 arch_enter_lazy_mmu_mode();
1035
1036 do {
1037
1038
1039
1040
1041 if (progress >= 32) {
1042 progress = 0;
1043 if (need_resched() ||
1044 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1045 break;
1046 }
1047 if (pte_none(*src_pte)) {
1048 progress++;
1049 continue;
1050 }
1051 if (unlikely(!pte_present(*src_pte))) {
1052 ret = copy_nonpresent_pte(dst_mm, src_mm,
1053 dst_pte, src_pte,
1054 dst_vma, src_vma,
1055 addr, rss);
1056 if (ret == -EIO) {
1057 entry = pte_to_swp_entry(*src_pte);
1058 break;
1059 } else if (ret == -EBUSY) {
1060 break;
1061 } else if (!ret) {
1062 progress += 8;
1063 continue;
1064 }
1065
1066
1067
1068
1069
1070 WARN_ON_ONCE(ret != -ENOENT);
1071 }
1072
1073 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
1074 addr, rss, &prealloc);
1075
1076
1077
1078
1079 if (unlikely(ret == -EAGAIN))
1080 break;
1081 if (unlikely(prealloc)) {
1082
1083
1084
1085
1086
1087
1088 put_page(prealloc);
1089 prealloc = NULL;
1090 }
1091 progress += 8;
1092 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1093
1094 arch_leave_lazy_mmu_mode();
1095 spin_unlock(src_ptl);
1096 pte_unmap(orig_src_pte);
1097 add_mm_rss_vec(dst_mm, rss);
1098 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1099 cond_resched();
1100
1101 if (ret == -EIO) {
1102 VM_WARN_ON_ONCE(!entry.val);
1103 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1104 ret = -ENOMEM;
1105 goto out;
1106 }
1107 entry.val = 0;
1108 } else if (ret == -EBUSY) {
1109 goto out;
1110 } else if (ret == -EAGAIN) {
1111 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
1112 if (!prealloc)
1113 return -ENOMEM;
1114 } else if (ret) {
1115 VM_WARN_ON_ONCE(1);
1116 }
1117
1118
1119 ret = 0;
1120
1121 if (addr != end)
1122 goto again;
1123out:
1124 if (unlikely(prealloc))
1125 put_page(prealloc);
1126 return ret;
1127}
1128
1129static inline int
1130copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1131 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1132 unsigned long end)
1133{
1134 struct mm_struct *dst_mm = dst_vma->vm_mm;
1135 struct mm_struct *src_mm = src_vma->vm_mm;
1136 pmd_t *src_pmd, *dst_pmd;
1137 unsigned long next;
1138
1139 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1140 if (!dst_pmd)
1141 return -ENOMEM;
1142 src_pmd = pmd_offset(src_pud, addr);
1143 do {
1144 next = pmd_addr_end(addr, end);
1145 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1146 || pmd_devmap(*src_pmd)) {
1147 int err;
1148 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1149 err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1150 addr, dst_vma, src_vma);
1151 if (err == -ENOMEM)
1152 return -ENOMEM;
1153 if (!err)
1154 continue;
1155
1156 }
1157 if (pmd_none_or_clear_bad(src_pmd))
1158 continue;
1159 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1160 addr, next))
1161 return -ENOMEM;
1162 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1163 return 0;
1164}
1165
1166static inline int
1167copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1168 p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1169 unsigned long end)
1170{
1171 struct mm_struct *dst_mm = dst_vma->vm_mm;
1172 struct mm_struct *src_mm = src_vma->vm_mm;
1173 pud_t *src_pud, *dst_pud;
1174 unsigned long next;
1175
1176 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1177 if (!dst_pud)
1178 return -ENOMEM;
1179 src_pud = pud_offset(src_p4d, addr);
1180 do {
1181 next = pud_addr_end(addr, end);
1182 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1183 int err;
1184
1185 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1186 err = copy_huge_pud(dst_mm, src_mm,
1187 dst_pud, src_pud, addr, src_vma);
1188 if (err == -ENOMEM)
1189 return -ENOMEM;
1190 if (!err)
1191 continue;
1192
1193 }
1194 if (pud_none_or_clear_bad(src_pud))
1195 continue;
1196 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1197 addr, next))
1198 return -ENOMEM;
1199 } while (dst_pud++, src_pud++, addr = next, addr != end);
1200 return 0;
1201}
1202
1203static inline int
1204copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1205 pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1206 unsigned long end)
1207{
1208 struct mm_struct *dst_mm = dst_vma->vm_mm;
1209 p4d_t *src_p4d, *dst_p4d;
1210 unsigned long next;
1211
1212 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1213 if (!dst_p4d)
1214 return -ENOMEM;
1215 src_p4d = p4d_offset(src_pgd, addr);
1216 do {
1217 next = p4d_addr_end(addr, end);
1218 if (p4d_none_or_clear_bad(src_p4d))
1219 continue;
1220 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1221 addr, next))
1222 return -ENOMEM;
1223 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1224 return 0;
1225}
1226
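/*
 * Top-level fork-time copy of one vma's page tables.  Plain anonymous
 * mappings with no anon_vma yet are skipped entirely (they can be faulted
 * in later); hugetlb and PFN-mapped vmas take their own paths; for
 * copy-on-write mappings the source mm is wrapped in mmu-notifier and
 * write_protect_seq protection while the ptes are write-protected.
 */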
1227int
1228copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1229{
1230 pgd_t *src_pgd, *dst_pgd;
1231 unsigned long next;
1232 unsigned long addr = src_vma->vm_start;
1233 unsigned long end = src_vma->vm_end;
1234 struct mm_struct *dst_mm = dst_vma->vm_mm;
1235 struct mm_struct *src_mm = src_vma->vm_mm;
1236 struct mmu_notifier_range range;
1237 bool is_cow;
1238 int ret;
1239
1240
1241
1242
1243
1244
1245
1246 if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1247 !src_vma->anon_vma)
1248 return 0;
1249
1250 if (is_vm_hugetlb_page(src_vma))
1251 return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1252
1253 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1254
1255
1256
1257
1258 ret = track_pfn_copy(src_vma);
1259 if (ret)
1260 return ret;
1261 }
1262
1263
1264
1265
1266
1267
1268
1269 is_cow = is_cow_mapping(src_vma->vm_flags);
1270
1271 if (is_cow) {
1272 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1273 0, src_vma, src_mm, addr, end);
1274 mmu_notifier_invalidate_range_start(&range);
1275
1276
1277
1278
1279
1280
1281
1282 mmap_assert_write_locked(src_mm);
1283 raw_write_seqcount_begin(&src_mm->write_protect_seq);
1284 }
1285
1286 ret = 0;
1287 dst_pgd = pgd_offset(dst_mm, addr);
1288 src_pgd = pgd_offset(src_mm, addr);
1289 do {
1290 next = pgd_addr_end(addr, end);
1291 if (pgd_none_or_clear_bad(src_pgd))
1292 continue;
1293 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1294 addr, next))) {
1295 ret = -ENOMEM;
1296 break;
1297 }
1298 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1299
1300 if (is_cow) {
1301 raw_write_seqcount_end(&src_mm->write_protect_seq);
1302 mmu_notifier_invalidate_range_end(&range);
1303 }
1304 return ret;
1305}
1306
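/*
 * Core pte-unmapping loop: clear each pte, update the rss counters,
 * transfer dirty/accessed bits to the struct page, and hand the pages to
 * the mmu_gather for batched TLB flushing and freeing.  Non-present
 * entries (swap, migration, device-private) get their references dropped
 * here as well.
 */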
1307static unsigned long zap_pte_range(struct mmu_gather *tlb,
1308 struct vm_area_struct *vma, pmd_t *pmd,
1309 unsigned long addr, unsigned long end,
1310 struct zap_details *details)
1311{
1312 struct mm_struct *mm = tlb->mm;
1313 int force_flush = 0;
1314 int rss[NR_MM_COUNTERS];
1315 spinlock_t *ptl;
1316 pte_t *start_pte;
1317 pte_t *pte;
1318 swp_entry_t entry;
1319
1320 tlb_change_page_size(tlb, PAGE_SIZE);
1321again:
1322 init_rss_vec(rss);
1323 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1324 pte = start_pte;
1325 flush_tlb_batched_pending(mm);
1326 arch_enter_lazy_mmu_mode();
1327 do {
1328 pte_t ptent = *pte;
1329 if (pte_none(ptent))
1330 continue;
1331
1332 if (need_resched())
1333 break;
1334
1335 if (pte_present(ptent)) {
1336 struct page *page;
1337
1338 page = vm_normal_page(vma, addr, ptent);
1339 if (unlikely(zap_skip_check_mapping(details, page)))
1340 continue;
1341 ptent = ptep_get_and_clear_full(mm, addr, pte,
1342 tlb->fullmm);
1343 tlb_remove_tlb_entry(tlb, pte, addr);
1344 if (unlikely(!page))
1345 continue;
1346
1347 if (!PageAnon(page)) {
1348 if (pte_dirty(ptent)) {
1349 force_flush = 1;
1350 set_page_dirty(page);
1351 }
1352 if (pte_young(ptent) &&
1353 likely(!(vma->vm_flags & VM_SEQ_READ)))
1354 mark_page_accessed(page);
1355 }
1356 rss[mm_counter(page)]--;
1357 page_remove_rmap(page, false);
1358 if (unlikely(page_mapcount(page) < 0))
1359 print_bad_pte(vma, addr, ptent, page);
1360 if (unlikely(__tlb_remove_page(tlb, page))) {
1361 force_flush = 1;
1362 addr += PAGE_SIZE;
1363 break;
1364 }
1365 continue;
1366 }
1367
1368 entry = pte_to_swp_entry(ptent);
1369 if (is_device_private_entry(entry) ||
1370 is_device_exclusive_entry(entry)) {
1371 struct page *page = pfn_swap_entry_to_page(entry);
1372
1373 if (unlikely(zap_skip_check_mapping(details, page)))
1374 continue;
1375 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1376 rss[mm_counter(page)]--;
1377
1378 if (is_device_private_entry(entry))
1379 page_remove_rmap(page, false);
1380
1381 put_page(page);
1382 continue;
1383 }
1384
1385
1386 if (unlikely(details))
1387 continue;
1388
1389 if (!non_swap_entry(entry))
1390 rss[MM_SWAPENTS]--;
1391 else if (is_migration_entry(entry)) {
1392 struct page *page;
1393
1394 page = pfn_swap_entry_to_page(entry);
1395 rss[mm_counter(page)]--;
1396 }
1397 if (unlikely(!free_swap_and_cache(entry)))
1398 print_bad_pte(vma, addr, ptent, NULL);
1399 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1400 } while (pte++, addr += PAGE_SIZE, addr != end);
1401
1402 add_mm_rss_vec(mm, rss);
1403 arch_leave_lazy_mmu_mode();
1404
1405
1406 if (force_flush)
1407 tlb_flush_mmu_tlbonly(tlb);
1408 pte_unmap_unlock(start_pte, ptl);
1409
1410
1411
1412
1413
1414
1415
1416 if (force_flush) {
1417 force_flush = 0;
1418 tlb_flush_mmu(tlb);
1419 }
1420
1421 if (addr != end) {
1422 cond_resched();
1423 goto again;
1424 }
1425
1426 return addr;
1427}
1428
1429static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1430 struct vm_area_struct *vma, pud_t *pud,
1431 unsigned long addr, unsigned long end,
1432 struct zap_details *details)
1433{
1434 pmd_t *pmd;
1435 unsigned long next;
1436
1437 pmd = pmd_offset(pud, addr);
1438 do {
1439 next = pmd_addr_end(addr, end);
1440 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1441 if (next - addr != HPAGE_PMD_SIZE)
1442 __split_huge_pmd(vma, pmd, addr, false, NULL);
1443 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1444 goto next;
1445
1446 } else if (details && details->single_page &&
1447 PageTransCompound(details->single_page) &&
1448 next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
1449 spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
1450
1451
1452
1453
1454
1455 spin_unlock(ptl);
1456 }
1457
1458
1459
1460
1461
1462
1463
1464
1465 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1466 goto next;
1467 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1468next:
1469 cond_resched();
1470 } while (pmd++, addr = next, addr != end);
1471
1472 return addr;
1473}
1474
1475static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1476 struct vm_area_struct *vma, p4d_t *p4d,
1477 unsigned long addr, unsigned long end,
1478 struct zap_details *details)
1479{
1480 pud_t *pud;
1481 unsigned long next;
1482
1483 pud = pud_offset(p4d, addr);
1484 do {
1485 next = pud_addr_end(addr, end);
1486 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1487 if (next - addr != HPAGE_PUD_SIZE) {
1488 mmap_assert_locked(tlb->mm);
1489 split_huge_pud(vma, pud, addr);
1490 } else if (zap_huge_pud(tlb, vma, pud, addr))
1491 goto next;
1492
1493 }
1494 if (pud_none_or_clear_bad(pud))
1495 continue;
1496 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1497next:
1498 cond_resched();
1499 } while (pud++, addr = next, addr != end);
1500
1501 return addr;
1502}
1503
1504static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1505 struct vm_area_struct *vma, pgd_t *pgd,
1506 unsigned long addr, unsigned long end,
1507 struct zap_details *details)
1508{
1509 p4d_t *p4d;
1510 unsigned long next;
1511
1512 p4d = p4d_offset(pgd, addr);
1513 do {
1514 next = p4d_addr_end(addr, end);
1515 if (p4d_none_or_clear_bad(p4d))
1516 continue;
1517 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1518 } while (p4d++, addr = next, addr != end);
1519
1520 return addr;
1521}
1522
1523void unmap_page_range(struct mmu_gather *tlb,
1524 struct vm_area_struct *vma,
1525 unsigned long addr, unsigned long end,
1526 struct zap_details *details)
1527{
1528 pgd_t *pgd;
1529 unsigned long next;
1530
1531 BUG_ON(addr >= end);
1532 tlb_start_vma(tlb, vma);
1533 pgd = pgd_offset(vma->vm_mm, addr);
1534 do {
1535 next = pgd_addr_end(addr, end);
1536 if (pgd_none_or_clear_bad(pgd))
1537 continue;
1538 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1539 } while (pgd++, addr = next, addr != end);
1540 tlb_end_vma(tlb, vma);
1541}
1542
1543
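/*
 * Unmap the portion of one vma that falls inside [start_addr, end_addr),
 * dispatching hugetlb vmas to the hugetlb-specific teardown path.
 */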
1544static void unmap_single_vma(struct mmu_gather *tlb,
1545 struct vm_area_struct *vma, unsigned long start_addr,
1546 unsigned long end_addr,
1547 struct zap_details *details)
1548{
1549 unsigned long start = max(vma->vm_start, start_addr);
1550 unsigned long end;
1551
1552 if (start >= vma->vm_end)
1553 return;
1554 end = min(vma->vm_end, end_addr);
1555 if (end <= vma->vm_start)
1556 return;
1557
1558 if (vma->vm_file)
1559 uprobe_munmap(vma, start, end);
1560
1561 if (unlikely(vma->vm_flags & VM_PFNMAP))
1562 untrack_pfn(vma, 0, 0);
1563
1564 if (start != end) {
1565 if (unlikely(is_vm_hugetlb_page(vma))) {
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577 if (vma->vm_file) {
1578 i_mmap_lock_write(vma->vm_file->f_mapping);
1579 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1580 i_mmap_unlock_write(vma->vm_file->f_mapping);
1581 }
1582 } else
1583 unmap_page_range(tlb, vma, start, end, details);
1584 }
1585}
1586
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped
 * address range after unmap_vmas() returns.
 */
1605void unmap_vmas(struct mmu_gather *tlb,
1606 struct vm_area_struct *vma, unsigned long start_addr,
1607 unsigned long end_addr)
1608{
1609 struct mmu_notifier_range range;
1610
1611 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1612 start_addr, end_addr);
1613 mmu_notifier_invalidate_range_start(&range);
1614 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1615 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1616 mmu_notifier_invalidate_range_end(&range);
1617}
1618
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
1627void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1628 unsigned long size)
1629{
1630 struct mmu_notifier_range range;
1631 struct mmu_gather tlb;
1632
1633 lru_add_drain();
1634 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1635 start, start + size);
1636 tlb_gather_mmu(&tlb, vma->vm_mm);
1637 update_hiwater_rss(vma->vm_mm);
1638 mmu_notifier_invalidate_range_start(&range);
1639 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1640 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1641 mmu_notifier_invalidate_range_end(&range);
1642 tlb_finish_mmu(&tlb);
1643}
1644
1645
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1654static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1655 unsigned long size, struct zap_details *details)
1656{
1657 struct mmu_notifier_range range;
1658 struct mmu_gather tlb;
1659
1660 lru_add_drain();
1661 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1662 address, address + size);
1663 tlb_gather_mmu(&tlb, vma->vm_mm);
1664 update_hiwater_rss(vma->vm_mm);
1665 mmu_notifier_invalidate_range_start(&range);
1666 unmap_single_vma(&tlb, vma, address, range.end, details);
1667 mmu_notifier_invalidate_range_end(&range);
1668 tlb_finish_mmu(&tlb);
1669}
1670
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 */
1682void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1683 unsigned long size)
1684{
1685 if (address < vma->vm_start || address + size > vma->vm_end ||
1686 !(vma->vm_flags & VM_PFNMAP))
1687 return;
1688
1689 zap_page_range_single(vma, address, size, NULL);
1690}
1691EXPORT_SYMBOL_GPL(zap_vma_ptes);
1692
1693static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1694{
1695 pgd_t *pgd;
1696 p4d_t *p4d;
1697 pud_t *pud;
1698 pmd_t *pmd;
1699
1700 pgd = pgd_offset(mm, addr);
1701 p4d = p4d_alloc(mm, pgd, addr);
1702 if (!p4d)
1703 return NULL;
1704 pud = pud_alloc(mm, p4d, addr);
1705 if (!pud)
1706 return NULL;
1707 pmd = pmd_alloc(mm, pud, addr);
1708 if (!pmd)
1709 return NULL;
1710
1711 VM_BUG_ON(pmd_trans_huge(*pmd));
1712 return pmd;
1713}
1714
1715pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1716 spinlock_t **ptl)
1717{
1718 pmd_t *pmd = walk_to_pmd(mm, addr);
1719
1720 if (!pmd)
1721 return NULL;
1722 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1723}
1724
1725static int validate_page_before_insert(struct page *page)
1726{
1727 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1728 return -EINVAL;
1729 flush_dcache_page(page);
1730 return 0;
1731}
1732
1733static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
1734 unsigned long addr, struct page *page, pgprot_t prot)
1735{
1736 if (!pte_none(*pte))
1737 return -EBUSY;
1738
1739 get_page(page);
1740 inc_mm_counter_fast(mm, mm_counter_file(page));
1741 page_add_file_rmap(page, false);
1742 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1743 return 0;
1744}
1745
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
1753static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1754 struct page *page, pgprot_t prot)
1755{
1756 struct mm_struct *mm = vma->vm_mm;
1757 int retval;
1758 pte_t *pte;
1759 spinlock_t *ptl;
1760
1761 retval = validate_page_before_insert(page);
1762 if (retval)
1763 goto out;
1764 retval = -ENOMEM;
1765 pte = get_locked_pte(mm, addr, &ptl);
1766 if (!pte)
1767 goto out;
1768 retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
1769 pte_unmap_unlock(pte, ptl);
1770out:
1771 return retval;
1772}
1773
1774#ifdef pte_index
1775static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
1776 unsigned long addr, struct page *page, pgprot_t prot)
1777{
1778 int err;
1779
1780 if (!page_count(page))
1781 return -EINVAL;
1782 err = validate_page_before_insert(page);
1783 if (err)
1784 return err;
1785 return insert_page_into_pte_locked(mm, pte, addr, page, prot);
1786}
1787
1788
1789
1790
1791static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
1792 struct page **pages, unsigned long *num, pgprot_t prot)
1793{
1794 pmd_t *pmd = NULL;
1795 pte_t *start_pte, *pte;
1796 spinlock_t *pte_lock;
1797 struct mm_struct *const mm = vma->vm_mm;
1798 unsigned long curr_page_idx = 0;
1799 unsigned long remaining_pages_total = *num;
1800 unsigned long pages_to_write_in_pmd;
1801 int ret;
1802more:
1803 ret = -EFAULT;
1804 pmd = walk_to_pmd(mm, addr);
1805 if (!pmd)
1806 goto out;
1807
1808 pages_to_write_in_pmd = min_t(unsigned long,
1809 remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
1810
1811
1812 ret = -ENOMEM;
1813 if (pte_alloc(mm, pmd))
1814 goto out;
1815
1816 while (pages_to_write_in_pmd) {
1817 int pte_idx = 0;
1818 const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
1819
1820 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
1821 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
1822 int err = insert_page_in_batch_locked(mm, pte,
1823 addr, pages[curr_page_idx], prot);
1824 if (unlikely(err)) {
1825 pte_unmap_unlock(start_pte, pte_lock);
1826 ret = err;
1827 remaining_pages_total -= pte_idx;
1828 goto out;
1829 }
1830 addr += PAGE_SIZE;
1831 ++curr_page_idx;
1832 }
1833 pte_unmap_unlock(start_pte, pte_lock);
1834 pages_to_write_in_pmd -= batch_size;
1835 remaining_pages_total -= batch_size;
1836 }
1837 if (remaining_pages_total)
1838 goto more;
1839 ret = 0;
1840out:
1841 *num = remaining_pages_total;
1842 return ret;
1843}
1844#endif
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
1862 struct page **pages, unsigned long *num)
1863{
1864#ifdef pte_index
1865 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
1866
1867 if (addr < vma->vm_start || end_addr >= vma->vm_end)
1868 return -EFAULT;
1869 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1870 BUG_ON(mmap_read_trylock(vma->vm_mm));
1871 BUG_ON(vma->vm_flags & VM_PFNMAP);
1872 vma->vm_flags |= VM_MIXEDMAP;
1873 }
1874
1875 return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
1876#else
1877 unsigned long idx = 0, pgcount = *num;
1878 int err = -EINVAL;
1879
1880 for (; idx < pgcount; ++idx) {
1881 err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
1882 if (err)
1883 break;
1884 }
1885 *num = pgcount - idx;
1886 return err;
1887#endif
1888}
1889EXPORT_SYMBOL(vm_insert_pages);
1890
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.  The page must be a refcounted kernel allocation;
 * anonymous, slab and type-tagged pages are rejected.
 *
 * The first call on a vma marks it VM_MIXEDMAP, which requires the
 * mmap lock to be held for write, so this is usually done from an
 * f_op->mmap() handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1920int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1921 struct page *page)
1922{
1923 if (addr < vma->vm_start || addr >= vma->vm_end)
1924 return -EFAULT;
1925 if (!page_count(page))
1926 return -EINVAL;
1927 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1928 BUG_ON(mmap_read_trylock(vma->vm_mm));
1929 BUG_ON(vma->vm_flags & VM_PFNMAP);
1930 vma->vm_flags |= VM_MIXEDMAP;
1931 }
1932 return insert_page(vma, addr, page, vma->vm_page_prot);
1933}
1934EXPORT_SYMBOL(vm_insert_page);
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1948 unsigned long num, unsigned long offset)
1949{
1950 unsigned long count = vma_pages(vma);
1951 unsigned long uaddr = vma->vm_start;
1952 int ret, i;
1953
1954
1955 if (offset >= num)
1956 return -ENXIO;
1957
1958
1959 if (count > num - offset)
1960 return -ENXIO;
1961
1962 for (i = 0; i < count; i++) {
1963 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1964 if (ret < 0)
1965 return ret;
1966 uaddr += PAGE_SIZE;
1967 }
1968
1969 return 0;
1970}
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1991 unsigned long num)
1992{
1993 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1994}
1995EXPORT_SYMBOL(vm_map_pages);
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
2011 unsigned long num)
2012{
2013 return __vm_map_pages(vma, pages, num, 0);
2014}
2015EXPORT_SYMBOL(vm_map_pages_zero);
2016
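/*
 * Install a single special (pfn-based) pte.  If a mapping already exists
 * at this address it is left alone, unless @mkwrite asks for the existing
 * entry for the same pfn to be upgraded to young/dirty/writable.
 */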
2017static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2018 pfn_t pfn, pgprot_t prot, bool mkwrite)
2019{
2020 struct mm_struct *mm = vma->vm_mm;
2021 pte_t *pte, entry;
2022 spinlock_t *ptl;
2023
2024 pte = get_locked_pte(mm, addr, &ptl);
2025 if (!pte)
2026 return VM_FAULT_OOM;
2027 if (!pte_none(*pte)) {
2028 if (mkwrite) {
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
2040 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
2041 goto out_unlock;
2042 }
2043 entry = pte_mkyoung(*pte);
2044 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2045 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
2046 update_mmu_cache(vma, addr, pte);
2047 }
2048 goto out_unlock;
2049 }
2050
2051
2052 if (pfn_t_devmap(pfn))
2053 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
2054 else
2055 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
2056
2057 if (mkwrite) {
2058 entry = pte_mkyoung(entry);
2059 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2060 }
2061
2062 set_pte_at(mm, addr, pte, entry);
2063 update_mmu_cache(vma, addr, pte);
2064
2065out_unlock:
2066 pte_unmap_unlock(pte, ptl);
2067 return VM_FAULT_NOPAGE;
2068}
2069
/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override the pgprot used for the mapping instead of relying on
 * vma->vm_page_prot.  The vma must already be marked VM_PFNMAP or
 * VM_MIXEDMAP (but not both), and PFN mappings must not be used for
 * copy-on-write mappings.
 *
 * Return: VM_FAULT_NOPAGE on success, or an appropriate vm_fault_t error.
 */
2091vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2092 unsigned long pfn, pgprot_t pgprot)
2093{
2094
2095
2096
2097
2098
2099
2100 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2101 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2102 (VM_PFNMAP|VM_MIXEDMAP));
2103 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2104 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2105
2106 if (addr < vma->vm_start || addr >= vma->vm_end)
2107 return VM_FAULT_SIGBUS;
2108
2109 if (!pfn_modify_allowed(pfn, pgprot))
2110 return VM_FAULT_SIGBUS;
2111
2112 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2113
2114 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2115 false);
2116}
2117EXPORT_SYMBOL(vmf_insert_pfn_prot);
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2140 unsigned long pfn)
2141{
2142 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2143}
2144EXPORT_SYMBOL(vmf_insert_pfn);
2145
2146static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
2147{
2148
2149 if (vma->vm_flags & VM_MIXEDMAP)
2150 return true;
2151 if (pfn_t_devmap(pfn))
2152 return true;
2153 if (pfn_t_special(pfn))
2154 return true;
2155 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2156 return true;
2157 return false;
2158}
2159
2160static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2161 unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2162 bool mkwrite)
2163{
2164 int err;
2165
2166 BUG_ON(!vm_mixed_ok(vma, pfn));
2167
2168 if (addr < vma->vm_start || addr >= vma->vm_end)
2169 return VM_FAULT_SIGBUS;
2170
2171 track_pfn_insert(vma, &pgprot, pfn);
2172
2173 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2174 return VM_FAULT_SIGBUS;
2175
2176
2177
2178
2179
2180
2181
2182
2183 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2184 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2185 struct page *page;
2186
2187
2188
2189
2190
2191
2192 page = pfn_to_page(pfn_t_to_pfn(pfn));
2193 err = insert_page(vma, addr, page, pgprot);
2194 } else {
2195 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2196 }
2197
2198 if (err == -ENOMEM)
2199 return VM_FAULT_OOM;
2200 if (err < 0 && err != -EBUSY)
2201 return VM_FAULT_SIGBUS;
2202
2203 return VM_FAULT_NOPAGE;
2204}
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2233 pfn_t pfn, pgprot_t pgprot)
2234{
2235 return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2236}
2237EXPORT_SYMBOL(vmf_insert_mixed_prot);
2238
2239vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2240 pfn_t pfn)
2241{
2242 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2243}
2244EXPORT_SYMBOL(vmf_insert_mixed);
2245
2246
2247
2248
2249
2250
2251vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2252 unsigned long addr, pfn_t pfn)
2253{
2254 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2255}
2256EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2257
2258
2259
2260
2261
2262
2263static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2264 unsigned long addr, unsigned long end,
2265 unsigned long pfn, pgprot_t prot)
2266{
2267 pte_t *pte, *mapped_pte;
2268 spinlock_t *ptl;
2269 int err = 0;
2270
2271 mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2272 if (!pte)
2273 return -ENOMEM;
2274 arch_enter_lazy_mmu_mode();
2275 do {
2276 BUG_ON(!pte_none(*pte));
2277 if (!pfn_modify_allowed(pfn, prot)) {
2278 err = -EACCES;
2279 break;
2280 }
2281 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2282 pfn++;
2283 } while (pte++, addr += PAGE_SIZE, addr != end);
2284 arch_leave_lazy_mmu_mode();
2285 pte_unmap_unlock(mapped_pte, ptl);
2286 return err;
2287}
2288
2289static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2290 unsigned long addr, unsigned long end,
2291 unsigned long pfn, pgprot_t prot)
2292{
2293 pmd_t *pmd;
2294 unsigned long next;
2295 int err;
2296
2297 pfn -= addr >> PAGE_SHIFT;
2298 pmd = pmd_alloc(mm, pud, addr);
2299 if (!pmd)
2300 return -ENOMEM;
2301 VM_BUG_ON(pmd_trans_huge(*pmd));
2302 do {
2303 next = pmd_addr_end(addr, end);
2304 err = remap_pte_range(mm, pmd, addr, next,
2305 pfn + (addr >> PAGE_SHIFT), prot);
2306 if (err)
2307 return err;
2308 } while (pmd++, addr = next, addr != end);
2309 return 0;
2310}
2311
2312static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2313 unsigned long addr, unsigned long end,
2314 unsigned long pfn, pgprot_t prot)
2315{
2316 pud_t *pud;
2317 unsigned long next;
2318 int err;
2319
2320 pfn -= addr >> PAGE_SHIFT;
2321 pud = pud_alloc(mm, p4d, addr);
2322 if (!pud)
2323 return -ENOMEM;
2324 do {
2325 next = pud_addr_end(addr, end);
2326 err = remap_pmd_range(mm, pud, addr, next,
2327 pfn + (addr >> PAGE_SHIFT), prot);
2328 if (err)
2329 return err;
2330 } while (pud++, addr = next, addr != end);
2331 return 0;
2332}
2333
2334static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2335 unsigned long addr, unsigned long end,
2336 unsigned long pfn, pgprot_t prot)
2337{
2338 p4d_t *p4d;
2339 unsigned long next;
2340 int err;
2341
2342 pfn -= addr >> PAGE_SHIFT;
2343 p4d = p4d_alloc(mm, pgd, addr);
2344 if (!p4d)
2345 return -ENOMEM;
2346 do {
2347 next = p4d_addr_end(addr, end);
2348 err = remap_pud_range(mm, p4d, addr, next,
2349 pfn + (addr >> PAGE_SHIFT), prot);
2350 if (err)
2351 return err;
2352 } while (p4d++, addr = next, addr != end);
2353 return 0;
2354}
2355
2356
2357
2358
2359
2360int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
2361 unsigned long pfn, unsigned long size, pgprot_t prot)
2362{
2363 pgd_t *pgd;
2364 unsigned long next;
2365 unsigned long end = addr + PAGE_ALIGN(size);
2366 struct mm_struct *mm = vma->vm_mm;
2367 int err;
2368
2369 if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2370 return -EINVAL;
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390 if (is_cow_mapping(vma->vm_flags)) {
2391 if (addr != vma->vm_start || end != vma->vm_end)
2392 return -EINVAL;
2393 vma->vm_pgoff = pfn;
2394 }
2395
2396 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2397
2398 BUG_ON(addr >= end);
2399 pfn -= addr >> PAGE_SHIFT;
2400 pgd = pgd_offset(mm, addr);
2401 flush_cache_range(vma, addr, end);
2402 do {
2403 next = pgd_addr_end(addr, end);
2404 err = remap_p4d_range(mm, pgd, addr, next,
2405 pfn + (addr >> PAGE_SHIFT), prot);
2406 if (err)
2407 return err;
2408 } while (pgd++, addr = next, addr != end);
2409
2410 return 0;
2411}
2412
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
2425int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2426 unsigned long pfn, unsigned long size, pgprot_t prot)
2427{
2428 int err;
2429
2430 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2431 if (err)
2432 return -EINVAL;
2433
2434 err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
2435 if (err)
2436 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2437 return err;
2438}
2439EXPORT_SYMBOL(remap_pfn_range);
2440
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use.  The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
2456int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2457{
2458 unsigned long vm_len, pfn, pages;
2459
2460
2461 if (start + len < start)
2462 return -EINVAL;
2463
2464
2465
2466
2467
2468 len += start & ~PAGE_MASK;
2469 pfn = start >> PAGE_SHIFT;
2470 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2471 if (pfn + pages < pfn)
2472 return -EINVAL;
2473
2474
2475 if (vma->vm_pgoff > pages)
2476 return -EINVAL;
2477 pfn += vma->vm_pgoff;
2478 pages -= vma->vm_pgoff;
2479
2480
2481 vm_len = vma->vm_end - vma->vm_start;
2482 if (vm_len >> PAGE_SHIFT > pages)
2483 return -EINVAL;
2484
2485
2486 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2487}
2488EXPORT_SYMBOL(vm_iomap_memory);
2489
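/*
 * Lowest level of the apply_to_page_range() machinery: walk (and, if
 * @create is set, allocate) the pte page under this pmd and invoke @fn on
 * each pte in the range, taking the pte lock for user mms.
 */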
2490static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2491 unsigned long addr, unsigned long end,
2492 pte_fn_t fn, void *data, bool create,
2493 pgtbl_mod_mask *mask)
2494{
2495 pte_t *pte, *mapped_pte;
2496 int err = 0;
2497 spinlock_t *ptl;
2498
2499 if (create) {
2500 mapped_pte = pte = (mm == &init_mm) ?
2501 pte_alloc_kernel_track(pmd, addr, mask) :
2502 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2503 if (!pte)
2504 return -ENOMEM;
2505 } else {
2506 mapped_pte = pte = (mm == &init_mm) ?
2507 pte_offset_kernel(pmd, addr) :
2508 pte_offset_map_lock(mm, pmd, addr, &ptl);
2509 }
2510
2511 BUG_ON(pmd_huge(*pmd));
2512
2513 arch_enter_lazy_mmu_mode();
2514
2515 if (fn) {
2516 do {
2517 if (create || !pte_none(*pte)) {
2518 err = fn(pte++, addr, data);
2519 if (err)
2520 break;
2521 }
2522 } while (addr += PAGE_SIZE, addr != end);
2523 }
2524 *mask |= PGTBL_PTE_MODIFIED;
2525
2526 arch_leave_lazy_mmu_mode();
2527
2528 if (mm != &init_mm)
2529 pte_unmap_unlock(mapped_pte, ptl);
2530 return err;
2531}
2532
2533static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2534 unsigned long addr, unsigned long end,
2535 pte_fn_t fn, void *data, bool create,
2536 pgtbl_mod_mask *mask)
2537{
2538 pmd_t *pmd;
2539 unsigned long next;
2540 int err = 0;
2541
2542 BUG_ON(pud_huge(*pud));
2543
2544 if (create) {
2545 pmd = pmd_alloc_track(mm, pud, addr, mask);
2546 if (!pmd)
2547 return -ENOMEM;
2548 } else {
2549 pmd = pmd_offset(pud, addr);
2550 }
2551 do {
2552 next = pmd_addr_end(addr, end);
2553 if (pmd_none(*pmd) && !create)
2554 continue;
2555 if (WARN_ON_ONCE(pmd_leaf(*pmd)))
2556 return -EINVAL;
2557 if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
2558 if (!create)
2559 continue;
2560 pmd_clear_bad(pmd);
2561 }
2562 err = apply_to_pte_range(mm, pmd, addr, next,
2563 fn, data, create, mask);
2564 if (err)
2565 break;
2566 } while (pmd++, addr = next, addr != end);
2567
2568 return err;
2569}
2570
2571static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2572 unsigned long addr, unsigned long end,
2573 pte_fn_t fn, void *data, bool create,
2574 pgtbl_mod_mask *mask)
2575{
2576 pud_t *pud;
2577 unsigned long next;
2578 int err = 0;
2579
2580 if (create) {
2581 pud = pud_alloc_track(mm, p4d, addr, mask);
2582 if (!pud)
2583 return -ENOMEM;
2584 } else {
2585 pud = pud_offset(p4d, addr);
2586 }
2587 do {
2588 next = pud_addr_end(addr, end);
2589 if (pud_none(*pud) && !create)
2590 continue;
2591 if (WARN_ON_ONCE(pud_leaf(*pud)))
2592 return -EINVAL;
2593 if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
2594 if (!create)
2595 continue;
2596 pud_clear_bad(pud);
2597 }
2598 err = apply_to_pmd_range(mm, pud, addr, next,
2599 fn, data, create, mask);
2600 if (err)
2601 break;
2602 } while (pud++, addr = next, addr != end);
2603
2604 return err;
2605}
2606
2607static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2608 unsigned long addr, unsigned long end,
2609 pte_fn_t fn, void *data, bool create,
2610 pgtbl_mod_mask *mask)
2611{
2612 p4d_t *p4d;
2613 unsigned long next;
2614 int err = 0;
2615
2616 if (create) {
2617 p4d = p4d_alloc_track(mm, pgd, addr, mask);
2618 if (!p4d)
2619 return -ENOMEM;
2620 } else {
2621 p4d = p4d_offset(pgd, addr);
2622 }
2623 do {
2624 next = p4d_addr_end(addr, end);
2625 if (p4d_none(*p4d) && !create)
2626 continue;
2627 if (WARN_ON_ONCE(p4d_leaf(*p4d)))
2628 return -EINVAL;
2629 if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
2630 if (!create)
2631 continue;
2632 p4d_clear_bad(p4d);
2633 }
2634 err = apply_to_pud_range(mm, p4d, addr, next,
2635 fn, data, create, mask);
2636 if (err)
2637 break;
2638 } while (p4d++, addr = next, addr != end);
2639
2640 return err;
2641}
2642
2643static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2644 unsigned long size, pte_fn_t fn,
2645 void *data, bool create)
2646{
2647 pgd_t *pgd;
2648 unsigned long start = addr, next;
2649 unsigned long end = addr + size;
2650 pgtbl_mod_mask mask = 0;
2651 int err = 0;
2652
2653 if (WARN_ON(addr >= end))
2654 return -EINVAL;
2655
2656 pgd = pgd_offset(mm, addr);
2657 do {
2658 next = pgd_addr_end(addr, end);
2659 if (pgd_none(*pgd) && !create)
2660 continue;
2661 if (WARN_ON_ONCE(pgd_leaf(*pgd)))
2662 return -EINVAL;
2663 if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
2664 if (!create)
2665 continue;
2666 pgd_clear_bad(pgd);
2667 }
2668 err = apply_to_p4d_range(mm, pgd, addr, next,
2669 fn, data, create, &mask);
2670 if (err)
2671 break;
2672 } while (pgd++, addr = next, addr != end);
2673
2674 if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2675 arch_sync_kernel_mappings(start, start + size);
2676
2677 return err;
2678}
2679
2680
2681
2682
2683
2684int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2685 unsigned long size, pte_fn_t fn, void *data)
2686{
2687 return __apply_to_page_range(mm, addr, size, fn, data, true);
2688}
2689EXPORT_SYMBOL_GPL(apply_to_page_range);
2690
2691
2692
2693
2694
2695
2696
2697
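/*
 * Scan a region of virtual memory, calling a provided function on each
 * leaf page table entry.  Unlike apply_to_page_range(), this variant only
 * visits page tables that already exist and never allocates new ones.
 */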
2698int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2699 unsigned long size, pte_fn_t fn, void *data)
2700{
2701 return __apply_to_page_range(mm, addr, size, fn, data, false);
2702}
2703EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
2704
2705
2706
2707
2708
2709
2710
2711
2712
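/*
 * handle_pte_fault() reads the PTE without holding the page table lock,
 * so the entry may change underneath us (for instance when swapoff
 * instantiates the page concurrently).  Re-check the entry under the lock
 * and report whether it is still the same; the locked check is only
 * needed when a pte_t is wider than a word and so cannot be read
 * atomically (SMP or preemptible kernels).  The PTE is unmapped either way.
 */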
2713static inline int pte_unmap_same(struct vm_fault *vmf)
2714{
2715 int same = 1;
2716#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2717 if (sizeof(pte_t) > sizeof(unsigned long)) {
2718 spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
2719 spin_lock(ptl);
2720 same = pte_same(*vmf->pte, vmf->orig_pte);
2721 spin_unlock(ptl);
2722 }
2723#endif
2724 pte_unmap(vmf->pte);
2725 vmf->pte = NULL;
2726 return same;
2727}
2728
2729static inline bool cow_user_page(struct page *dst, struct page *src,
2730 struct vm_fault *vmf)
2731{
2732 bool ret;
2733 void *kaddr;
2734 void __user *uaddr;
2735 bool locked = false;
2736 struct vm_area_struct *vma = vmf->vma;
2737 struct mm_struct *mm = vma->vm_mm;
2738 unsigned long addr = vmf->address;
2739
2740 if (likely(src)) {
2741 copy_user_highpage(dst, src, addr, vma);
2742 return true;
2743 }
2744
2745
2746
2747
2748
2749
2750
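 /*
  * src == NULL means the fault was against a PFN mapping with no
  * struct page behind it: do a best-effort copy straight from the
  * faulting user address, and fall back to zero-filling the
  * destination if even that fails.
  */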
2751 kaddr = kmap_atomic(dst);
2752 uaddr = (void __user *)(addr & PAGE_MASK);
2753
2754
2755
2756
2757
2758 if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2759 pte_t entry;
2760
2761 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2762 locked = true;
2763 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2764
2765
2766
2767
2768 update_mmu_tlb(vma, addr, vmf->pte);
2769 ret = false;
2770 goto pte_unlock;
2771 }
2772
2773 entry = pte_mkyoung(vmf->orig_pte);
2774 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
2775 update_mmu_cache(vma, addr, vmf->pte);
2776 }
2777
2778
2779
2780
2781
2782
2783
2784 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2785 if (locked)
2786 goto warn;
2787
2788
2789 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2790 locked = true;
2791 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2792
2793 update_mmu_tlb(vma, addr, vmf->pte);
2794 ret = false;
2795 goto pte_unlock;
2796 }
2797
2798
2799
2800
2801
2802 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2803
2804
2805
2806
2807warn:
2808 WARN_ON_ONCE(1);
2809 clear_page(kaddr);
2810 }
2811 }
2812
2813 ret = true;
2814
2815pte_unlock:
2816 if (locked)
2817 pte_unmap_unlock(vmf->pte, vmf->ptl);
2818 kunmap_atomic(kaddr);
2819 flush_dcache_page(dst);
2820
2821 return ret;
2822}
2823
2824static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2825{
2826 struct file *vm_file = vma->vm_file;
2827
2828 if (vm_file)
2829 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2830
2831
2832
2833
2834
2835 return GFP_KERNEL;
2836}
2837
2838
2839
2840
2841
2842
2843
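/*
 * Notify the address space that the page is about to become writable so
 * that it can prepare for it (for example allocate backing store).  On
 * success the page is returned locked; writes to a file that is in use as
 * a swapfile are refused with SIGBUS.
 */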
2844static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2845{
2846 vm_fault_t ret;
2847 struct page *page = vmf->page;
2848 unsigned int old_flags = vmf->flags;
2849
2850 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2851
2852 if (vmf->vma->vm_file &&
2853 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2854 return VM_FAULT_SIGBUS;
2855
2856 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2857
2858 vmf->flags = old_flags;
2859 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2860 return ret;
2861 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2862 lock_page(page);
2863 if (!page->mapping) {
2864 unlock_page(page);
2865 return 0;
2866 }
2867 ret |= VM_FAULT_LOCKED;
2868 } else
2869 VM_BUG_ON_PAGE(!PageLocked(page), page);
2870 return ret;
2871}
2872
2873
2874
2875
2876
2877
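/*
 * Handle dirtying of a page in a shared file mapping: mark the page
 * dirty, drop the page lock, update the file time when the filesystem has
 * no ->page_mkwrite(), and throttle against dirty page writeback,
 * possibly dropping mmap_lock and returning VM_FAULT_RETRY to do so.
 */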
2878static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2879{
2880 struct vm_area_struct *vma = vmf->vma;
2881 struct address_space *mapping;
2882 struct page *page = vmf->page;
2883 bool dirtied;
2884 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2885
2886 dirtied = set_page_dirty(page);
2887 VM_BUG_ON_PAGE(PageAnon(page), page);
2888
2889
2890
2891
2892
2893
2894 mapping = page_rmapping(page);
2895 unlock_page(page);
2896
2897 if (!page_mkwrite)
2898 file_update_time(vma->vm_file);
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909 if ((dirtied || page_mkwrite) && mapping) {
2910 struct file *fpin;
2911
2912 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2913 balance_dirty_pages_ratelimited(mapping);
2914 if (fpin) {
2915 fput(fpin);
2916 return VM_FAULT_RETRY;
2917 }
2918 }
2919
2920 return 0;
2921}
2922
2923
2924
2925
2926
2927
2928
2929
2930
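/*
 * Handle a write fault by reusing the existing page in place (for example
 * a single-mapped anonymous page or a shared writable mapping): mark the
 * PTE young, dirty and writable without allocating anything.  Called with
 * the page table lock held; the lock is dropped before returning.
 */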
2931static inline void wp_page_reuse(struct vm_fault *vmf)
2932 __releases(vmf->ptl)
2933{
2934 struct vm_area_struct *vma = vmf->vma;
2935 struct page *page = vmf->page;
2936 pte_t entry;
2937
2938
2939
2940
2941
2942 if (page)
2943 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2944
2945 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2946 entry = pte_mkyoung(vmf->orig_pte);
2947 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2948 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2949 update_mmu_cache(vma, vmf->address, vmf->pte);
2950 pte_unmap_unlock(vmf->pte, vmf->ptl);
2951 count_vm_event(PGREUSE);
2952}
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
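/*
 * Break COW for the faulting address: allocate a new anonymous page, copy
 * the old contents (or zero-fill when the fault was against the zero
 * page), and switch the PTE to the new page under the page table lock,
 * with MMU notifier invalidation around the change.  Called with
 * mmap_lock held, a reference on the old page (if any), and the page
 * table lock already dropped by the caller.
 */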
2970static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2971{
2972 struct vm_area_struct *vma = vmf->vma;
2973 struct mm_struct *mm = vma->vm_mm;
2974 struct page *old_page = vmf->page;
2975 struct page *new_page = NULL;
2976 pte_t entry;
2977 int page_copied = 0;
2978 struct mmu_notifier_range range;
2979
2980 if (unlikely(anon_vma_prepare(vma)))
2981 goto oom;
2982
2983 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2984 new_page = alloc_zeroed_user_highpage_movable(vma,
2985 vmf->address);
2986 if (!new_page)
2987 goto oom;
2988 } else {
2989 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2990 vmf->address);
2991 if (!new_page)
2992 goto oom;
2993
2994 if (!cow_user_page(new_page, old_page, vmf)) {
2995
2996
2997
2998
2999
3000
3001 put_page(new_page);
3002 if (old_page)
3003 put_page(old_page);
3004 return 0;
3005 }
3006 }
3007
3008 if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
3009 goto oom_free_new;
3010 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
3011
3012 __SetPageUptodate(new_page);
3013
3014 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
3015 vmf->address & PAGE_MASK,
3016 (vmf->address & PAGE_MASK) + PAGE_SIZE);
3017 mmu_notifier_invalidate_range_start(&range);
3018
3019
3020
3021
3022 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3023 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
3024 if (old_page) {
3025 if (!PageAnon(old_page)) {
3026 dec_mm_counter_fast(mm,
3027 mm_counter_file(old_page));
3028 inc_mm_counter_fast(mm, MM_ANONPAGES);
3029 }
3030 } else {
3031 inc_mm_counter_fast(mm, MM_ANONPAGES);
3032 }
3033 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3034 entry = mk_pte(new_page, vma->vm_page_prot);
3035 entry = pte_sw_mkyoung(entry);
3036 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3037
3038
3039
3040
3041
3042
3043
3044
3045 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
3046 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
3047 lru_cache_add_inactive_or_unevictable(new_page, vma);
3048
3049
3050
3051
3052
3053 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
3054 update_mmu_cache(vma, vmf->address, vmf->pte);
3055 if (old_page) {
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078 page_remove_rmap(old_page, false);
3079 }
3080
3081
3082 new_page = old_page;
3083 page_copied = 1;
3084 } else {
3085 update_mmu_tlb(vma, vmf->address, vmf->pte);
3086 }
3087
3088 if (new_page)
3089 put_page(new_page);
3090
3091 pte_unmap_unlock(vmf->pte, vmf->ptl);
3092
3093
3094
3095
3096 mmu_notifier_invalidate_range_only_end(&range);
3097 if (old_page) {
3098
3099
3100
3101
3102 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
3103 lock_page(old_page);
3104 if (PageMlocked(old_page))
3105 munlock_vma_page(old_page);
3106 unlock_page(old_page);
3107 }
3108 if (page_copied)
3109 free_swap_cache(old_page);
3110 put_page(old_page);
3111 }
3112 return page_copied ? VM_FAULT_WRITE : 0;
3113oom_free_new:
3114 put_page(new_page);
3115oom:
3116 if (old_page)
3117 put_page(old_page);
3118 return VM_FAULT_OOM;
3119}
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
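/*
 * finish_mkwrite_fault - finish the write-enable of a shared mapping
 * after ->pfn_mkwrite() or ->page_mkwrite() was called with the page
 * table lock dropped.  Re-take the lock, check that the PTE did not
 * change in the meantime, and reuse the existing mapping; returns
 * VM_FAULT_NOPAGE if the PTE changed so the caller can back out.
 */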
3137vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
3138{
3139 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3140 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3141 &vmf->ptl);
3142
3143
3144
3145
3146 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3147 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3148 pte_unmap_unlock(vmf->pte, vmf->ptl);
3149 return VM_FAULT_NOPAGE;
3150 }
3151 wp_page_reuse(vmf);
3152 return 0;
3153}
3154
3155
3156
3157
3158
3159static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3160{
3161 struct vm_area_struct *vma = vmf->vma;
3162
3163 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3164 vm_fault_t ret;
3165
3166 pte_unmap_unlock(vmf->pte, vmf->ptl);
3167 vmf->flags |= FAULT_FLAG_MKWRITE;
3168 ret = vma->vm_ops->pfn_mkwrite(vmf);
3169 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3170 return ret;
3171 return finish_mkwrite_fault(vmf);
3172 }
3173 wp_page_reuse(vmf);
3174 return VM_FAULT_WRITE;
3175}
3176
3177static vm_fault_t wp_page_shared(struct vm_fault *vmf)
3178 __releases(vmf->ptl)
3179{
3180 struct vm_area_struct *vma = vmf->vma;
3181 vm_fault_t ret = VM_FAULT_WRITE;
3182
3183 get_page(vmf->page);
3184
3185 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3186 vm_fault_t tmp;
3187
3188 pte_unmap_unlock(vmf->pte, vmf->ptl);
3189 tmp = do_page_mkwrite(vmf);
3190 if (unlikely(!tmp || (tmp &
3191 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3192 put_page(vmf->page);
3193 return tmp;
3194 }
3195 tmp = finish_mkwrite_fault(vmf);
3196 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3197 unlock_page(vmf->page);
3198 put_page(vmf->page);
3199 return tmp;
3200 }
3201 } else {
3202 wp_page_reuse(vmf);
3203 lock_page(vmf->page);
3204 }
3205 ret |= fault_dirty_shared_page(vmf);
3206 put_page(vmf->page);
3207
3208 return ret;
3209}
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
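/*
 * do_wp_page - handle a write fault on a present but write-protected PTE.
 * Userfaultfd write-protect faults are handed to userspace first;
 * otherwise the page is either reused in place (exclusively owned
 * anonymous pages, or shared writable mappings via ->pfn_mkwrite /
 * ->page_mkwrite) or copied to break COW.  Called with the page table
 * lock held; it is released on every path.
 */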
3229static vm_fault_t do_wp_page(struct vm_fault *vmf)
3230 __releases(vmf->ptl)
3231{
3232 struct vm_area_struct *vma = vmf->vma;
3233
3234 if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3235 pte_unmap_unlock(vmf->pte, vmf->ptl);
3236 return handle_userfault(vmf, VM_UFFD_WP);
3237 }
3238
3239
3240
3241
3242
3243 if (unlikely(userfaultfd_wp(vmf->vma) &&
3244 mm_tlb_flush_pending(vmf->vma->vm_mm)))
3245 flush_tlb_page(vmf->vma, vmf->address);
3246
3247 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3248 if (!vmf->page) {
3249
3250
3251
3252
3253
3254
3255
3256 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3257 (VM_WRITE|VM_SHARED))
3258 return wp_pfn_shared(vmf);
3259
3260 pte_unmap_unlock(vmf->pte, vmf->ptl);
3261 return wp_page_copy(vmf);
3262 }
3263
3264
3265
3266
3267
3268 if (PageAnon(vmf->page)) {
3269 struct page *page = vmf->page;
3270
3271
3272 if (PageKsm(page) || page_count(page) != 1)
3273 goto copy;
3274 if (!trylock_page(page))
3275 goto copy;
3276 if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
3277 unlock_page(page);
3278 goto copy;
3279 }
3280
3281
3282
3283
3284
3285 unlock_page(page);
3286 wp_page_reuse(vmf);
3287 return VM_FAULT_WRITE;
3288 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3289 (VM_WRITE|VM_SHARED))) {
3290 return wp_page_shared(vmf);
3291 }
3292copy:
3293
3294
3295
3296 get_page(vmf->page);
3297
3298 pte_unmap_unlock(vmf->pte, vmf->ptl);
3299 return wp_page_copy(vmf);
3300}
3301
3302static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3303 unsigned long start_addr, unsigned long end_addr,
3304 struct zap_details *details)
3305{
3306 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3307}
3308
3309static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3310 pgoff_t first_index,
3311 pgoff_t last_index,
3312 struct zap_details *details)
3313{
3314 struct vm_area_struct *vma;
3315 pgoff_t vba, vea, zba, zea;
3316
3317 vma_interval_tree_foreach(vma, root, first_index, last_index) {
3318 vba = vma->vm_pgoff;
3319 vea = vba + vma_pages(vma) - 1;
3320 zba = first_index;
3321 if (zba < vba)
3322 zba = vba;
3323 zea = last_index;
3324 if (zea > vea)
3325 zea = vea;
3326
3327 unmap_mapping_range_vma(vma,
3328 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3329 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3330 details);
3331 }
3332}
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
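/*
 * unmap_mapping_page - unmap a single locked page from every process that
 * still has it mapped through its address_space, by walking the mapping's
 * i_mmap interval tree and zapping the matching range in each VMA.  The
 * page must be locked and must not be a tail page.
 */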
3345void unmap_mapping_page(struct page *page)
3346{
3347 struct address_space *mapping = page->mapping;
3348 struct zap_details details = { };
3349 pgoff_t first_index;
3350 pgoff_t last_index;
3351
3352 VM_BUG_ON(!PageLocked(page));
3353 VM_BUG_ON(PageTail(page));
3354
3355 first_index = page->index;
3356 last_index = page->index + thp_nr_pages(page) - 1;
3357
3358 details.zap_mapping = mapping;
3359 details.single_page = page;
3360
3361 i_mmap_lock_write(mapping);
3362 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3363 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3364 last_index, &details);
3365 i_mmap_unlock_write(mapping);
3366}
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3381 pgoff_t nr, bool even_cows)
3382{
3383 struct zap_details details = { };
3384 pgoff_t first_index = start;
3385 pgoff_t last_index = start + nr - 1;
3386
3387 details.zap_mapping = even_cows ? NULL : mapping;
3388 if (last_index < first_index)
3389 last_index = ULONG_MAX;
3390
3391 i_mmap_lock_write(mapping);
3392 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3393 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
3394 last_index, &details);
3395 i_mmap_unlock_write(mapping);
3396}
3397EXPORT_SYMBOL_GPL(unmap_mapping_pages);
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
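/*
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address space corresponding to the specified byte range in the
 * underlying file.  A holelen of zero means "to the end of the file".
 * even_cows selects whether private copy-on-write copies of the file
 * pages are unmapped as well (typically wanted when truncating, but not
 * when merely invalidating pagecache).
 */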
3416void unmap_mapping_range(struct address_space *mapping,
3417 loff_t const holebegin, loff_t const holelen, int even_cows)
3418{
3419 pgoff_t hba = holebegin >> PAGE_SHIFT;
3420 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3421
3422
3423 if (sizeof(holelen) > sizeof(hlen)) {
3424 long long holeend =
3425 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3426 if (holeend & ~(long long)ULONG_MAX)
3427 hlen = ULONG_MAX - hba + 1;
3428 }
3429
3430 unmap_mapping_pages(mapping, hba, hlen, even_cows);
3431}
3432EXPORT_SYMBOL(unmap_mapping_range);
3433
3434
3435
3436
3437static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
3438{
3439 struct page *page = vmf->page;
3440 struct vm_area_struct *vma = vmf->vma;
3441 struct mmu_notifier_range range;
3442
3443 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
3444 return VM_FAULT_RETRY;
3445 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
3446 vma->vm_mm, vmf->address & PAGE_MASK,
3447 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
3448 mmu_notifier_invalidate_range_start(&range);
3449
3450 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3451 &vmf->ptl);
3452 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3453 restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
3454
3455 pte_unmap_unlock(vmf->pte, vmf->ptl);
3456 unlock_page(page);
3457
3458 mmu_notifier_invalidate_range_end(&range);
3459 return 0;
3460}
3461
3462
3463
3464
3465
3466
3467
3468
3469
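/*
 * do_swap_page - handle a fault on a PTE holding a (non-present) swap
 * entry.  Special non-swap entries (migration, device-private,
 * device-exclusive, hwpoison) are dispatched first; otherwise the page is
 * looked up in, or read into, the swap cache, locked, charged and mapped
 * back in, and a write fault is then passed on to do_wp_page() if the
 * page cannot be made writable directly.  Called with mmap_lock held and
 * the PTE unlocked.
 */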
3470vm_fault_t do_swap_page(struct vm_fault *vmf)
3471{
3472 struct vm_area_struct *vma = vmf->vma;
3473 struct page *page = NULL, *swapcache;
3474 struct swap_info_struct *si = NULL;
3475 swp_entry_t entry;
3476 pte_t pte;
3477 int locked;
3478 int exclusive = 0;
3479 vm_fault_t ret = 0;
3480 void *shadow = NULL;
3481
3482 if (!pte_unmap_same(vmf))
3483 goto out;
3484
3485 entry = pte_to_swp_entry(vmf->orig_pte);
3486 if (unlikely(non_swap_entry(entry))) {
3487 if (is_migration_entry(entry)) {
3488 migration_entry_wait(vma->vm_mm, vmf->pmd,
3489 vmf->address);
3490 } else if (is_device_exclusive_entry(entry)) {
3491 vmf->page = pfn_swap_entry_to_page(entry);
3492 ret = remove_device_exclusive_entry(vmf);
3493 } else if (is_device_private_entry(entry)) {
3494 vmf->page = pfn_swap_entry_to_page(entry);
3495 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3496 } else if (is_hwpoison_entry(entry)) {
3497 ret = VM_FAULT_HWPOISON;
3498 } else {
3499 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3500 ret = VM_FAULT_SIGBUS;
3501 }
3502 goto out;
3503 }
3504
3505
3506 si = get_swap_device(entry);
3507 if (unlikely(!si))
3508 goto out;
3509
3510 delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
3511 page = lookup_swap_cache(entry, vma, vmf->address);
3512 swapcache = page;
3513
3514 if (!page) {
3515 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
3516 __swap_count(entry) == 1) {
3517
3518 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3519 vmf->address);
3520 if (page) {
3521 __SetPageLocked(page);
3522 __SetPageSwapBacked(page);
3523
3524 if (mem_cgroup_swapin_charge_page(page,
3525 vma->vm_mm, GFP_KERNEL, entry)) {
3526 ret = VM_FAULT_OOM;
3527 goto out_page;
3528 }
3529 mem_cgroup_swapin_uncharge_swap(entry);
3530
3531 shadow = get_shadow_from_swap_cache(entry);
3532 if (shadow)
3533 workingset_refault(page_folio(page),
3534 shadow);
3535
3536 lru_cache_add(page);
3537
3538
3539 set_page_private(page, entry.val);
3540 swap_readpage(page, true);
3541 set_page_private(page, 0);
3542 }
3543 } else {
3544 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3545 vmf);
3546 swapcache = page;
3547 }
3548
3549 if (!page) {
3550
3551
3552
3553
3554 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3555 vmf->address, &vmf->ptl);
3556 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3557 ret = VM_FAULT_OOM;
3558 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3559 goto unlock;
3560 }
3561
3562
3563 ret = VM_FAULT_MAJOR;
3564 count_vm_event(PGMAJFAULT);
3565 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3566 } else if (PageHWPoison(page)) {
3567
3568
3569
3570
3571 ret = VM_FAULT_HWPOISON;
3572 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3573 goto out_release;
3574 }
3575
3576 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3577
3578 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3579 if (!locked) {
3580 ret |= VM_FAULT_RETRY;
3581 goto out_release;
3582 }
3583
3584
3585
3586
3587
3588
3589
3590 if (unlikely((!PageSwapCache(page) ||
3591 page_private(page) != entry.val)) && swapcache)
3592 goto out_page;
3593
3594 page = ksm_might_need_to_copy(page, vma, vmf->address);
3595 if (unlikely(!page)) {
3596 ret = VM_FAULT_OOM;
3597 page = swapcache;
3598 goto out_page;
3599 }
3600
3601 cgroup_throttle_swaprate(page, GFP_KERNEL);
3602
3603
3604
3605
3606 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3607 &vmf->ptl);
3608 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3609 goto out_nomap;
3610
3611 if (unlikely(!PageUptodate(page))) {
3612 ret = VM_FAULT_SIGBUS;
3613 goto out_nomap;
3614 }
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3627 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3628 pte = mk_pte(page, vma->vm_page_prot);
3629 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3630 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3631 vmf->flags &= ~FAULT_FLAG_WRITE;
3632 ret |= VM_FAULT_WRITE;
3633 exclusive = RMAP_EXCLUSIVE;
3634 }
3635 flush_icache_page(vma, page);
3636 if (pte_swp_soft_dirty(vmf->orig_pte))
3637 pte = pte_mksoft_dirty(pte);
3638 if (pte_swp_uffd_wp(vmf->orig_pte)) {
3639 pte = pte_mkuffd_wp(pte);
3640 pte = pte_wrprotect(pte);
3641 }
3642 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3643 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3644 vmf->orig_pte = pte;
3645
3646
3647 if (unlikely(page != swapcache && swapcache)) {
3648 page_add_new_anon_rmap(page, vma, vmf->address, false);
3649 lru_cache_add_inactive_or_unevictable(page, vma);
3650 } else {
3651 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3652 }
3653
3654 swap_free(entry);
3655 if (mem_cgroup_swap_full(page) ||
3656 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3657 try_to_free_swap(page);
3658 unlock_page(page);
3659 if (page != swapcache && swapcache) {
3660
3661
3662
3663
3664
3665
3666
3667
3668 unlock_page(swapcache);
3669 put_page(swapcache);
3670 }
3671
3672 if (vmf->flags & FAULT_FLAG_WRITE) {
3673 ret |= do_wp_page(vmf);
3674 if (ret & VM_FAULT_ERROR)
3675 ret &= VM_FAULT_ERROR;
3676 goto out;
3677 }
3678
3679
3680 update_mmu_cache(vma, vmf->address, vmf->pte);
3681unlock:
3682 pte_unmap_unlock(vmf->pte, vmf->ptl);
3683out:
3684 if (si)
3685 put_swap_device(si);
3686 return ret;
3687out_nomap:
3688 pte_unmap_unlock(vmf->pte, vmf->ptl);
3689out_page:
3690 unlock_page(page);
3691out_release:
3692 put_page(page);
3693 if (page != swapcache && swapcache) {
3694 unlock_page(swapcache);
3695 put_page(swapcache);
3696 }
3697 if (si)
3698 put_swap_device(si);
3699 return ret;
3700}
3701
3702
3703
3704
3705
3706
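/*
 * do_anonymous_page - handle a fault in an anonymous private mapping with
 * no PTE present.  Read faults map the shared zero page read-only; write
 * faults allocate, charge and map a zeroed page, deferring to userfaultfd
 * when the VMA has a missing-page handler registered.  Called with
 * mmap_lock held and no page table lock taken yet.
 */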
3707static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3708{
3709 struct vm_area_struct *vma = vmf->vma;
3710 struct page *page;
3711 vm_fault_t ret = 0;
3712 pte_t entry;
3713
3714
3715 if (vma->vm_flags & VM_SHARED)
3716 return VM_FAULT_SIGBUS;
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728 if (pte_alloc(vma->vm_mm, vmf->pmd))
3729 return VM_FAULT_OOM;
3730
3731
3732 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3733 return 0;
3734
3735
3736 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3737 !mm_forbids_zeropage(vma->vm_mm)) {
3738 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3739 vma->vm_page_prot));
3740 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3741 vmf->address, &vmf->ptl);
3742 if (!pte_none(*vmf->pte)) {
3743 update_mmu_tlb(vma, vmf->address, vmf->pte);
3744 goto unlock;
3745 }
3746 ret = check_stable_address_space(vma->vm_mm);
3747 if (ret)
3748 goto unlock;
3749
3750 if (userfaultfd_missing(vma)) {
3751 pte_unmap_unlock(vmf->pte, vmf->ptl);
3752 return handle_userfault(vmf, VM_UFFD_MISSING);
3753 }
3754 goto setpte;
3755 }
3756
3757
3758 if (unlikely(anon_vma_prepare(vma)))
3759 goto oom;
3760 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3761 if (!page)
3762 goto oom;
3763
3764 if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
3765 goto oom_free_page;
3766 cgroup_throttle_swaprate(page, GFP_KERNEL);
3767
3768
3769
3770
3771
3772
3773 __SetPageUptodate(page);
3774
3775 entry = mk_pte(page, vma->vm_page_prot);
3776 entry = pte_sw_mkyoung(entry);
3777 if (vma->vm_flags & VM_WRITE)
3778 entry = pte_mkwrite(pte_mkdirty(entry));
3779
3780 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3781 &vmf->ptl);
3782 if (!pte_none(*vmf->pte)) {
3783 update_mmu_tlb(vma, vmf->address, vmf->pte);
3784 goto release;
3785 }
3786
3787 ret = check_stable_address_space(vma->vm_mm);
3788 if (ret)
3789 goto release;
3790
3791
3792 if (userfaultfd_missing(vma)) {
3793 pte_unmap_unlock(vmf->pte, vmf->ptl);
3794 put_page(page);
3795 return handle_userfault(vmf, VM_UFFD_MISSING);
3796 }
3797
3798 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3799 page_add_new_anon_rmap(page, vma, vmf->address, false);
3800 lru_cache_add_inactive_or_unevictable(page, vma);
3801setpte:
3802 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3803
3804
3805 update_mmu_cache(vma, vmf->address, vmf->pte);
3806unlock:
3807 pte_unmap_unlock(vmf->pte, vmf->ptl);
3808 return ret;
3809release:
3810 put_page(page);
3811 goto unlock;
3812oom_free_page:
3813 put_page(page);
3814oom:
3815 return VM_FAULT_OOM;
3816}
3817
3818
3819
3820
3821
3822
3823static vm_fault_t __do_fault(struct vm_fault *vmf)
3824{
3825 struct vm_area_struct *vma = vmf->vma;
3826 vm_fault_t ret;
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
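 /*
  * Preallocate the page table before calling ->fault(): the fault
  * handler may return with the new page locked, and allocating the
  * table afterwards could recurse into reclaim while that lock is
  * held and risk deadlocking against writeback.
  */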
3843 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3844 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3845 if (!vmf->prealloc_pte)
3846 return VM_FAULT_OOM;
3847 }
3848
3849 ret = vma->vm_ops->fault(vmf);
3850 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3851 VM_FAULT_DONE_COW)))
3852 return ret;
3853
3854 if (unlikely(PageHWPoison(vmf->page))) {
3855 if (ret & VM_FAULT_LOCKED)
3856 unlock_page(vmf->page);
3857 put_page(vmf->page);
3858 vmf->page = NULL;
3859 return VM_FAULT_HWPOISON;
3860 }
3861
3862 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3863 lock_page(vmf->page);
3864 else
3865 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3866
3867 return ret;
3868}
3869
3870#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3871static void deposit_prealloc_pte(struct vm_fault *vmf)
3872{
3873 struct vm_area_struct *vma = vmf->vma;
3874
3875 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3876
3877
3878
3879
3880 mm_inc_nr_ptes(vma->vm_mm);
3881 vmf->prealloc_pte = NULL;
3882}
3883
3884vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3885{
3886 struct vm_area_struct *vma = vmf->vma;
3887 bool write = vmf->flags & FAULT_FLAG_WRITE;
3888 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3889 pmd_t entry;
3890 int i;
3891 vm_fault_t ret = VM_FAULT_FALLBACK;
3892
3893 if (!transhuge_vma_suitable(vma, haddr))
3894 return ret;
3895
3896 page = compound_head(page);
3897 if (compound_order(page) != HPAGE_PMD_ORDER)
3898 return ret;
3899
3900
3901
3902
3903
3904
3905
3906 if (unlikely(PageHasHWPoisoned(page)))
3907 return ret;
3908
3909
3910
3911
3912
3913 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3914 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3915 if (!vmf->prealloc_pte)
3916 return VM_FAULT_OOM;
3917 }
3918
3919 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3920 if (unlikely(!pmd_none(*vmf->pmd)))
3921 goto out;
3922
3923 for (i = 0; i < HPAGE_PMD_NR; i++)
3924 flush_icache_page(vma, page + i);
3925
3926 entry = mk_huge_pmd(page, vma->vm_page_prot);
3927 if (write)
3928 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3929
3930 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3931 page_add_file_rmap(page, true);
3932
3933
3934
3935 if (arch_needs_pgtable_deposit())
3936 deposit_prealloc_pte(vmf);
3937
3938 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3939
3940 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3941
3942
3943 ret = 0;
3944 count_vm_event(THP_FILE_MAPPED);
3945out:
3946 spin_unlock(vmf->ptl);
3947 return ret;
3948}
3949#else
3950vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3951{
3952 return VM_FAULT_FALLBACK;
3953}
3954#endif
3955
3956void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
3957{
3958 struct vm_area_struct *vma = vmf->vma;
3959 bool write = vmf->flags & FAULT_FLAG_WRITE;
3960 bool prefault = vmf->address != addr;
3961 pte_t entry;
3962
3963 flush_icache_page(vma, page);
3964 entry = mk_pte(page, vma->vm_page_prot);
3965
3966 if (prefault && arch_wants_old_prefaulted_pte())
3967 entry = pte_mkold(entry);
3968 else
3969 entry = pte_sw_mkyoung(entry);
3970
3971 if (write)
3972 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3973
3974 if (write && !(vma->vm_flags & VM_SHARED)) {
3975 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3976 page_add_new_anon_rmap(page, vma, addr, false);
3977 lru_cache_add_inactive_or_unevictable(page, vma);
3978 } else {
3979 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3980 page_add_file_rmap(page, false);
3981 }
3982 set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
3983}
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
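/*
 * finish_fault - install the page returned by ->fault() (or its COW copy)
 * into the page table.  Installs a preallocated or freshly allocated page
 * table if the PMD is still empty, maps compound pages with a huge PMD
 * when possible, and otherwise sets the PTE under the page table lock,
 * returning VM_FAULT_NOPAGE if another thread already populated the slot.
 */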
4000vm_fault_t finish_fault(struct vm_fault *vmf)
4001{
4002 struct vm_area_struct *vma = vmf->vma;
4003 struct page *page;
4004 vm_fault_t ret;
4005
4006
4007 if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
4008 page = vmf->cow_page;
4009 else
4010 page = vmf->page;
4011
4012
4013
4014
4015
4016 if (!(vma->vm_flags & VM_SHARED)) {
4017 ret = check_stable_address_space(vma->vm_mm);
4018 if (ret)
4019 return ret;
4020 }
4021
4022 if (pmd_none(*vmf->pmd)) {
4023 if (PageTransCompound(page)) {
4024 ret = do_set_pmd(vmf, page);
4025 if (ret != VM_FAULT_FALLBACK)
4026 return ret;
4027 }
4028
4029 if (vmf->prealloc_pte)
4030 pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
4031 else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
4032 return VM_FAULT_OOM;
4033 }
4034
4035
4036 if (pmd_devmap_trans_unstable(vmf->pmd))
4037 return 0;
4038
4039 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4040 vmf->address, &vmf->ptl);
4041 ret = 0;
4042
4043 if (likely(pte_none(*vmf->pte)))
4044 do_set_pte(vmf, page, vmf->address);
4045 else
4046 ret = VM_FAULT_NOPAGE;
4047
4048 update_mmu_tlb(vma, vmf->address, vmf->pte);
4049 pte_unmap_unlock(vmf->pte, vmf->ptl);
4050 return ret;
4051}
4052
4053static unsigned long fault_around_bytes __read_mostly =
4054 rounddown_pow_of_two(65536);
4055
4056#ifdef CONFIG_DEBUG_FS
4057static int fault_around_bytes_get(void *data, u64 *val)
4058{
4059 *val = fault_around_bytes;
4060 return 0;
4061}
4062
4063
4064
4065
4066
4067static int fault_around_bytes_set(void *data, u64 val)
4068{
4069 if (val / PAGE_SIZE > PTRS_PER_PTE)
4070 return -EINVAL;
4071 if (val > PAGE_SIZE)
4072 fault_around_bytes = rounddown_pow_of_two(val);
4073 else
4074 fault_around_bytes = PAGE_SIZE;
4075 return 0;
4076}
4077DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
4078 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
4079
4080static int __init fault_around_debugfs(void)
4081{
4082 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
4083 &fault_around_bytes_fops);
4084 return 0;
4085}
4086late_initcall(fault_around_debugfs);
4087#endif
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
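/*
 * do_fault_around - try to map a batch of pages around the faulting
 * address in one go by calling the filesystem's ->map_pages() for pages
 * that are already in the page cache.  The window is fault_around_bytes
 * wide (a debugfs tunable), clamped to the VMA and never crossing the
 * range covered by a single page table.
 */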
4113static vm_fault_t do_fault_around(struct vm_fault *vmf)
4114{
4115 unsigned long address = vmf->address, nr_pages, mask;
4116 pgoff_t start_pgoff = vmf->pgoff;
4117 pgoff_t end_pgoff;
4118 int off;
4119
4120 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
4121 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
4122
4123 address = max(address & mask, vmf->vma->vm_start);
4124 off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
4125 start_pgoff -= off;
4126
4127
4128
4129
4130
4131 end_pgoff = start_pgoff -
4132 ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
4133 PTRS_PER_PTE - 1;
4134 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
4135 start_pgoff + nr_pages - 1);
4136
4137 if (pmd_none(*vmf->pmd)) {
4138 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
4139 if (!vmf->prealloc_pte)
4140 return VM_FAULT_OOM;
4141 }
4142
4143 return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
4144}
4145
4146static vm_fault_t do_read_fault(struct vm_fault *vmf)
4147{
4148 struct vm_area_struct *vma = vmf->vma;
4149 vm_fault_t ret = 0;
4150
4151
4152
4153
4154
4155
4156 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
4157 if (likely(!userfaultfd_minor(vmf->vma))) {
4158 ret = do_fault_around(vmf);
4159 if (ret)
4160 return ret;
4161 }
4162 }
4163
4164 ret = __do_fault(vmf);
4165 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4166 return ret;
4167
4168 ret |= finish_fault(vmf);
4169 unlock_page(vmf->page);
4170 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4171 put_page(vmf->page);
4172 return ret;
4173}
4174
4175static vm_fault_t do_cow_fault(struct vm_fault *vmf)
4176{
4177 struct vm_area_struct *vma = vmf->vma;
4178 vm_fault_t ret;
4179
4180 if (unlikely(anon_vma_prepare(vma)))
4181 return VM_FAULT_OOM;
4182
4183 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
4184 if (!vmf->cow_page)
4185 return VM_FAULT_OOM;
4186
4187 if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
4188 GFP_KERNEL)) {
4189 put_page(vmf->cow_page);
4190 return VM_FAULT_OOM;
4191 }
4192 cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
4193
4194 ret = __do_fault(vmf);
4195 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4196 goto uncharge_out;
4197 if (ret & VM_FAULT_DONE_COW)
4198 return ret;
4199
4200 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
4201 __SetPageUptodate(vmf->cow_page);
4202
4203 ret |= finish_fault(vmf);
4204 unlock_page(vmf->page);
4205 put_page(vmf->page);
4206 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4207 goto uncharge_out;
4208 return ret;
4209uncharge_out:
4210 put_page(vmf->cow_page);
4211 return ret;
4212}
4213
4214static vm_fault_t do_shared_fault(struct vm_fault *vmf)
4215{
4216 struct vm_area_struct *vma = vmf->vma;
4217 vm_fault_t ret, tmp;
4218
4219 ret = __do_fault(vmf);
4220 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4221 return ret;
4222
4223
4224
4225
4226
4227 if (vma->vm_ops->page_mkwrite) {
4228 unlock_page(vmf->page);
4229 tmp = do_page_mkwrite(vmf);
4230 if (unlikely(!tmp ||
4231 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
4232 put_page(vmf->page);
4233 return tmp;
4234 }
4235 }
4236
4237 ret |= finish_fault(vmf);
4238 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
4239 VM_FAULT_RETRY))) {
4240 unlock_page(vmf->page);
4241 put_page(vmf->page);
4242 return ret;
4243 }
4244
4245 ret |= fault_dirty_shared_page(vmf);
4246 return ret;
4247}
4248
4249
4250
4251
4252
4253
4254
4255
4256
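/*
 * do_fault - dispatch a fault in a VMA with ->vm_ops to the read,
 * private-COW or shared-write handler, and cope with VMAs whose ->vm_ops
 * lacks a ->fault method.  Any page table preallocated for the fault is
 * released before returning.
 */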
4257static vm_fault_t do_fault(struct vm_fault *vmf)
4258{
4259 struct vm_area_struct *vma = vmf->vma;
4260 struct mm_struct *vm_mm = vma->vm_mm;
4261 vm_fault_t ret;
4262
4263
4264
4265
4266 if (!vma->vm_ops->fault) {
4267
4268
4269
4270
4271 if (unlikely(!pmd_present(*vmf->pmd)))
4272 ret = VM_FAULT_SIGBUS;
4273 else {
4274 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
4275 vmf->pmd,
4276 vmf->address,
4277 &vmf->ptl);
4278
4279
4280
4281
4282
4283
4284
4285 if (unlikely(pte_none(*vmf->pte)))
4286 ret = VM_FAULT_SIGBUS;
4287 else
4288 ret = VM_FAULT_NOPAGE;
4289
4290 pte_unmap_unlock(vmf->pte, vmf->ptl);
4291 }
4292 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
4293 ret = do_read_fault(vmf);
4294 else if (!(vma->vm_flags & VM_SHARED))
4295 ret = do_cow_fault(vmf);
4296 else
4297 ret = do_shared_fault(vmf);
4298
4299
4300 if (vmf->prealloc_pte) {
4301 pte_free(vm_mm, vmf->prealloc_pte);
4302 vmf->prealloc_pte = NULL;
4303 }
4304 return ret;
4305}
4306
4307int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
4308 unsigned long addr, int page_nid, int *flags)
4309{
4310 get_page(page);
4311
4312 count_vm_numa_event(NUMA_HINT_FAULTS);
4313 if (page_nid == numa_node_id()) {
4314 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4315 *flags |= TNF_FAULT_LOCAL;
4316 }
4317
4318 return mpol_misplaced(page, vma, addr);
4319}
4320
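/*
 * do_numa_page - handle a NUMA hinting fault on a PTE that the NUMA
 * balancer made PROT_NONE.  Restore the original protection and, if the
 * page sits on the wrong node, try to migrate it toward the faulting
 * CPU's node, feeding the outcome back into the NUMA balancing statistics
 * via task_numa_fault().
 */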
4321static vm_fault_t do_numa_page(struct vm_fault *vmf)
4322{
4323 struct vm_area_struct *vma = vmf->vma;
4324 struct page *page = NULL;
4325 int page_nid = NUMA_NO_NODE;
4326 int last_cpupid;
4327 int target_nid;
4328 pte_t pte, old_pte;
4329 bool was_writable = pte_savedwrite(vmf->orig_pte);
4330 int flags = 0;
4331
4332
4333
4334
4335
4336
4337 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4338 spin_lock(vmf->ptl);
4339 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4340 pte_unmap_unlock(vmf->pte, vmf->ptl);
4341 goto out;
4342 }
4343
4344
4345 old_pte = ptep_get(vmf->pte);
4346 pte = pte_modify(old_pte, vma->vm_page_prot);
4347
4348 page = vm_normal_page(vma, vmf->address, pte);
4349 if (!page)
4350 goto out_map;
4351
4352
4353 if (PageCompound(page))
4354 goto out_map;
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364 if (!was_writable)
4365 flags |= TNF_NO_GROUP;
4366
4367
4368
4369
4370
4371 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4372 flags |= TNF_SHARED;
4373
4374 last_cpupid = page_cpupid_last(page);
4375 page_nid = page_to_nid(page);
4376 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4377 &flags);
4378 if (target_nid == NUMA_NO_NODE) {
4379 put_page(page);
4380 goto out_map;
4381 }
4382 pte_unmap_unlock(vmf->pte, vmf->ptl);
4383
4384
4385 if (migrate_misplaced_page(page, vma, target_nid)) {
4386 page_nid = target_nid;
4387 flags |= TNF_MIGRATED;
4388 } else {
4389 flags |= TNF_MIGRATE_FAIL;
4390 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4391 spin_lock(vmf->ptl);
4392 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4393 pte_unmap_unlock(vmf->pte, vmf->ptl);
4394 goto out;
4395 }
4396 goto out_map;
4397 }
4398
4399out:
4400 if (page_nid != NUMA_NO_NODE)
4401 task_numa_fault(last_cpupid, page_nid, 1, flags);
4402 return 0;
4403out_map:
4404
4405
4406
4407
4408 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4409 pte = pte_modify(old_pte, vma->vm_page_prot);
4410 pte = pte_mkyoung(pte);
4411 if (was_writable)
4412 pte = pte_mkwrite(pte);
4413 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4414 update_mmu_cache(vma, vmf->address, vmf->pte);
4415 pte_unmap_unlock(vmf->pte, vmf->ptl);
4416 goto out;
4417}
4418
4419static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4420{
4421 if (vma_is_anonymous(vmf->vma))
4422 return do_huge_pmd_anonymous_page(vmf);
4423 if (vmf->vma->vm_ops->huge_fault)
4424 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4425 return VM_FAULT_FALLBACK;
4426}
4427
4428
4429static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
4430{
4431 if (vma_is_anonymous(vmf->vma)) {
4432 if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
4433 return handle_userfault(vmf, VM_UFFD_WP);
4434 return do_huge_pmd_wp_page(vmf);
4435 }
4436 if (vmf->vma->vm_ops->huge_fault) {
4437 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4438
4439 if (!(ret & VM_FAULT_FALLBACK))
4440 return ret;
4441 }
4442
4443
4444 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4445
4446 return VM_FAULT_FALLBACK;
4447}
4448
4449static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4450{
4451#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4452 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4453
4454 if (vma_is_anonymous(vmf->vma))
4455 goto split;
4456 if (vmf->vma->vm_ops->huge_fault) {
4457 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4458
4459 if (!(ret & VM_FAULT_FALLBACK))
4460 return ret;
4461 }
4462split:
4463
4464 __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4465#endif
4466 return VM_FAULT_FALLBACK;
4467}
4468
4469static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4470{
4471#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4472
4473 if (vma_is_anonymous(vmf->vma))
4474 return VM_FAULT_FALLBACK;
4475 if (vmf->vma->vm_ops->huge_fault)
4476 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4477#endif
4478 return VM_FAULT_FALLBACK;
4479}
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
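/*
 * handle_pte_fault - handle the PTE level of a fault once the upper
 * levels of the page table have been dealt with.  The PTE is read without
 * the lock and the fault is dispatched accordingly: no PTE -> anonymous
 * or file fault, swap entry -> do_swap_page(), NUMA hinting fault ->
 * do_numa_page(); for a present PTE the lock is taken and write-protect
 * or access-flag handling is done directly.
 */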
4496static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4497{
4498 pte_t entry;
4499
4500 if (unlikely(pmd_none(*vmf->pmd))) {
4501
4502
4503
4504
4505
4506
4507 vmf->pte = NULL;
4508 } else {
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521 if (pmd_devmap_trans_unstable(vmf->pmd))
4522 return 0;
4523
4524
4525
4526
4527
4528
4529 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4530 vmf->orig_pte = *vmf->pte;
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540 barrier();
4541 if (pte_none(vmf->orig_pte)) {
4542 pte_unmap(vmf->pte);
4543 vmf->pte = NULL;
4544 }
4545 }
4546
4547 if (!vmf->pte) {
4548 if (vma_is_anonymous(vmf->vma))
4549 return do_anonymous_page(vmf);
4550 else
4551 return do_fault(vmf);
4552 }
4553
4554 if (!pte_present(vmf->orig_pte))
4555 return do_swap_page(vmf);
4556
4557 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4558 return do_numa_page(vmf);
4559
4560 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4561 spin_lock(vmf->ptl);
4562 entry = vmf->orig_pte;
4563 if (unlikely(!pte_same(*vmf->pte, entry))) {
4564 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4565 goto unlock;
4566 }
4567 if (vmf->flags & FAULT_FLAG_WRITE) {
4568 if (!pte_write(entry))
4569 return do_wp_page(vmf);
4570 entry = pte_mkdirty(entry);
4571 }
4572 entry = pte_mkyoung(entry);
4573 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4574 vmf->flags & FAULT_FLAG_WRITE)) {
4575 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4576 } else {
4577
4578 if (vmf->flags & FAULT_FLAG_TRIED)
4579 goto unlock;
4580
4581
4582
4583
4584
4585
4586 if (vmf->flags & FAULT_FLAG_WRITE)
4587 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4588 }
4589unlock:
4590 pte_unmap_unlock(vmf->pte, vmf->ptl);
4591 return 0;
4592}
4593
4594
4595
4596
4597
4598
4599
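/*
 * __handle_mm_fault - walk (and allocate as needed) the page table
 * hierarchy for the faulting address, handling transparent huge PUD and
 * PMD faults at their own level and falling through to handle_pte_fault()
 * for the PTE level.
 */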
4600static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4601 unsigned long address, unsigned int flags)
4602{
4603 struct vm_fault vmf = {
4604 .vma = vma,
4605 .address = address & PAGE_MASK,
4606 .flags = flags,
4607 .pgoff = linear_page_index(vma, address),
4608 .gfp_mask = __get_fault_gfp_mask(vma),
4609 };
4610 unsigned int dirty = flags & FAULT_FLAG_WRITE;
4611 struct mm_struct *mm = vma->vm_mm;
4612 pgd_t *pgd;
4613 p4d_t *p4d;
4614 vm_fault_t ret;
4615
4616 pgd = pgd_offset(mm, address);
4617 p4d = p4d_alloc(mm, pgd, address);
4618 if (!p4d)
4619 return VM_FAULT_OOM;
4620
4621 vmf.pud = pud_alloc(mm, p4d, address);
4622 if (!vmf.pud)
4623 return VM_FAULT_OOM;
4624retry_pud:
4625 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4626 ret = create_huge_pud(&vmf);
4627 if (!(ret & VM_FAULT_FALLBACK))
4628 return ret;
4629 } else {
4630 pud_t orig_pud = *vmf.pud;
4631
4632 barrier();
4633 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4634
4635
4636
4637 if (dirty && !pud_write(orig_pud)) {
4638 ret = wp_huge_pud(&vmf, orig_pud);
4639 if (!(ret & VM_FAULT_FALLBACK))
4640 return ret;
4641 } else {
4642 huge_pud_set_accessed(&vmf, orig_pud);
4643 return 0;
4644 }
4645 }
4646 }
4647
4648 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4649 if (!vmf.pmd)
4650 return VM_FAULT_OOM;
4651
4652
4653 if (pud_trans_unstable(vmf.pud))
4654 goto retry_pud;
4655
4656 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4657 ret = create_huge_pmd(&vmf);
4658 if (!(ret & VM_FAULT_FALLBACK))
4659 return ret;
4660 } else {
4661 vmf.orig_pmd = *vmf.pmd;
4662
4663 barrier();
4664 if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
4665 VM_BUG_ON(thp_migration_supported() &&
4666 !is_pmd_migration_entry(vmf.orig_pmd));
4667 if (is_pmd_migration_entry(vmf.orig_pmd))
4668 pmd_migration_entry_wait(mm, vmf.pmd);
4669 return 0;
4670 }
4671 if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
4672 if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
4673 return do_huge_pmd_numa_page(&vmf);
4674
4675 if (dirty && !pmd_write(vmf.orig_pmd)) {
4676 ret = wp_huge_pmd(&vmf);
4677 if (!(ret & VM_FAULT_FALLBACK))
4678 return ret;
4679 } else {
4680 huge_pmd_set_accessed(&vmf);
4681 return 0;
4682 }
4683 }
4684 }
4685
4686 return handle_pte_fault(&vmf);
4687}
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704static inline void mm_account_fault(struct pt_regs *regs,
4705 unsigned long address, unsigned int flags,
4706 vm_fault_t ret)
4707{
4708 bool major;
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721 if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
4722 return;
4723
4724
4725
4726
4727
4728
4729 major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
4730
4731 if (major)
4732 current->maj_flt++;
4733 else
4734 current->min_flt++;
4735
4736
4737
4738
4739
4740
4741 if (!regs)
4742 return;
4743
4744 if (major)
4745 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4746 else
4747 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
4748}
4749
4750
4751
4752
4753
4754
4755
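/*
 * handle_mm_fault - entry point for handling a user page fault once the
 * VMA has been found.  Performs the architecture access check, memcg
 * user-fault bookkeeping, dispatches to hugetlb_fault() or
 * __handle_mm_fault(), and accounts the fault to the task counters and
 * perf events.  Called with mmap_lock already held.
 */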
4756vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4757 unsigned int flags, struct pt_regs *regs)
4758{
4759 vm_fault_t ret;
4760
4761 __set_current_state(TASK_RUNNING);
4762
4763 count_vm_event(PGFAULT);
4764 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4765
4766
4767 check_sync_rss_stat(current);
4768
4769 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4770 flags & FAULT_FLAG_INSTRUCTION,
4771 flags & FAULT_FLAG_REMOTE))
4772 return VM_FAULT_SIGSEGV;
4773
4774
4775
4776
4777
4778 if (flags & FAULT_FLAG_USER)
4779 mem_cgroup_enter_user_fault();
4780
4781 if (unlikely(is_vm_hugetlb_page(vma)))
4782 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4783 else
4784 ret = __handle_mm_fault(vma, address, flags);
4785
4786 if (flags & FAULT_FLAG_USER) {
4787 mem_cgroup_exit_user_fault();
4788
4789
4790
4791
4792
4793
4794 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4795 mem_cgroup_oom_synchronize(false);
4796 }
4797
4798 mm_account_fault(regs, address, flags, ret);
4799
4800 return ret;
4801}
4802EXPORT_SYMBOL_GPL(handle_mm_fault);
4803
4804#ifndef __PAGETABLE_P4D_FOLDED
4805
4806
4807
4808
4809int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4810{
4811 p4d_t *new = p4d_alloc_one(mm, address);
4812 if (!new)
4813 return -ENOMEM;
4814
4815 spin_lock(&mm->page_table_lock);
4816 if (pgd_present(*pgd)) {
4817 p4d_free(mm, new);
4818 } else {
4819 smp_wmb();
4820 pgd_populate(mm, pgd, new);
4821 }
4822 spin_unlock(&mm->page_table_lock);
4823 return 0;
4824}
4825#endif
4826
4827#ifndef __PAGETABLE_PUD_FOLDED
4828
4829
4830
4831
4832int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4833{
4834 pud_t *new = pud_alloc_one(mm, address);
4835 if (!new)
4836 return -ENOMEM;
4837
4838 spin_lock(&mm->page_table_lock);
4839 if (!p4d_present(*p4d)) {
4840 mm_inc_nr_puds(mm);
4841 smp_wmb();
4842 p4d_populate(mm, p4d, new);
4843 } else
4844 pud_free(mm, new);
4845 spin_unlock(&mm->page_table_lock);
4846 return 0;
4847}
4848#endif
4849
4850#ifndef __PAGETABLE_PMD_FOLDED
4851
4852
4853
4854
4855int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4856{
4857 spinlock_t *ptl;
4858 pmd_t *new = pmd_alloc_one(mm, address);
4859 if (!new)
4860 return -ENOMEM;
4861
4862 ptl = pud_lock(mm, pud);
4863 if (!pud_present(*pud)) {
4864 mm_inc_nr_pmds(mm);
4865 smp_wmb();
4866 pud_populate(mm, pud, new);
4867 } else {
4868 pmd_free(mm, new);
4869 }
4870 spin_unlock(ptl);
4871 return 0;
4872}
4873#endif
4874
4875int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
4876 struct mmu_notifier_range *range, pte_t **ptepp,
4877 pmd_t **pmdpp, spinlock_t **ptlp)
4878{
4879 pgd_t *pgd;
4880 p4d_t *p4d;
4881 pud_t *pud;
4882 pmd_t *pmd;
4883 pte_t *ptep;
4884
4885 pgd = pgd_offset(mm, address);
4886 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4887 goto out;
4888
4889 p4d = p4d_offset(pgd, address);
4890 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4891 goto out;
4892
4893 pud = pud_offset(p4d, address);
4894 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4895 goto out;
4896
4897 pmd = pmd_offset(pud, address);
4898 VM_BUG_ON(pmd_trans_huge(*pmd));
4899
4900 if (pmd_huge(*pmd)) {
4901 if (!pmdpp)
4902 goto out;
4903
4904 if (range) {
4905 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4906 NULL, mm, address & PMD_MASK,
4907 (address & PMD_MASK) + PMD_SIZE);
4908 mmu_notifier_invalidate_range_start(range);
4909 }
4910 *ptlp = pmd_lock(mm, pmd);
4911 if (pmd_huge(*pmd)) {
4912 *pmdpp = pmd;
4913 return 0;
4914 }
4915 spin_unlock(*ptlp);
4916 if (range)
4917 mmu_notifier_invalidate_range_end(range);
4918 }
4919
4920 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4921 goto out;
4922
4923 if (range) {
4924 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4925 address & PAGE_MASK,
4926 (address & PAGE_MASK) + PAGE_SIZE);
4927 mmu_notifier_invalidate_range_start(range);
4928 }
4929 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4930 if (!pte_present(*ptep))
4931 goto unlock;
4932 *ptepp = ptep;
4933 return 0;
4934unlock:
4935 pte_unmap_unlock(ptep, *ptlp);
4936 if (range)
4937 mmu_notifier_invalidate_range_end(range);
4938out:
4939 return -EINVAL;
4940}
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963int follow_pte(struct mm_struct *mm, unsigned long address,
4964 pte_t **ptepp, spinlock_t **ptlp)
4965{
4966 return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
4967}
4968EXPORT_SYMBOL_GPL(follow_pte);
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4984 unsigned long *pfn)
4985{
4986 int ret = -EINVAL;
4987 spinlock_t *ptl;
4988 pte_t *ptep;
4989
4990 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4991 return ret;
4992
4993 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4994 if (ret)
4995 return ret;
4996 *pfn = pte_pfn(*ptep);
4997 pte_unmap_unlock(ptep, ptl);
4998 return 0;
4999}
5000EXPORT_SYMBOL(follow_pfn);
5001
5002#ifdef CONFIG_HAVE_IOREMAP_PROT
5003int follow_phys(struct vm_area_struct *vma,
5004 unsigned long address, unsigned int flags,
5005 unsigned long *prot, resource_size_t *phys)
5006{
5007 int ret = -EINVAL;
5008 pte_t *ptep, pte;
5009 spinlock_t *ptl;
5010
5011 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5012 goto out;
5013
5014 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
5015 goto out;
5016 pte = *ptep;
5017
5018 if ((flags & FOLL_WRITE) && !pte_write(pte))
5019 goto unlock;
5020
5021 *prot = pgprot_val(pte_pgprot(pte));
5022 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5023
5024 ret = 0;
5025unlock:
5026 pte_unmap_unlock(ptep, ptl);
5027out:
5028 return ret;
5029}
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
5044 void *buf, int len, int write)
5045{
5046 resource_size_t phys_addr;
5047 unsigned long prot = 0;
5048 void __iomem *maddr;
5049 pte_t *ptep, pte;
5050 spinlock_t *ptl;
5051 int offset = offset_in_page(addr);
5052 int ret = -EINVAL;
5053
5054 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5055 return -EINVAL;
5056
5057retry:
5058 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5059 return -EINVAL;
5060 pte = *ptep;
5061 pte_unmap_unlock(ptep, ptl);
5062
5063 prot = pgprot_val(pte_pgprot(pte));
5064 phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5065
5066 if ((write & FOLL_WRITE) && !pte_write(pte))
5067 return -EINVAL;
5068
5069 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
5070 if (!maddr)
5071 return -ENOMEM;
5072
5073 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5074 goto out_unmap;
5075
5076 if (!pte_same(pte, *ptep)) {
5077 pte_unmap_unlock(ptep, ptl);
5078 iounmap(maddr);
5079
5080 goto retry;
5081 }
5082
5083 if (write)
5084 memcpy_toio(maddr + offset, buf, len);
5085 else
5086 memcpy_fromio(buf, maddr + offset, len);
5087 ret = len;
5088 pte_unmap_unlock(ptep, ptl);
5089out_unmap:
5090 iounmap(maddr);
5091
5092 return ret;
5093}
5094EXPORT_SYMBOL_GPL(generic_access_phys);
5095#endif
5096
5097
5098
5099
5100int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
5101 int len, unsigned int gup_flags)
5102{
5103 struct vm_area_struct *vma;
5104 void *old_buf = buf;
5105 int write = gup_flags & FOLL_WRITE;
5106
5107 if (mmap_read_lock_killable(mm))
5108 return 0;
5109
5110
5111 while (len) {
5112 int bytes, ret, offset;
5113 void *maddr;
5114 struct page *page = NULL;
5115
5116 ret = get_user_pages_remote(mm, addr, 1,
5117 gup_flags, &page, &vma, NULL);
5118 if (ret <= 0) {
5119#ifndef CONFIG_HAVE_IOREMAP_PROT
5120 break;
5121#else
5122
5123
5124
5125
5126 vma = vma_lookup(mm, addr);
5127 if (!vma)
5128 break;
5129 if (vma->vm_ops && vma->vm_ops->access)
5130 ret = vma->vm_ops->access(vma, addr, buf,
5131 len, write);
5132 if (ret <= 0)
5133 break;
5134 bytes = ret;
5135#endif
5136 } else {
5137 bytes = len;
5138 offset = addr & (PAGE_SIZE-1);
5139 if (bytes > PAGE_SIZE-offset)
5140 bytes = PAGE_SIZE-offset;
5141
5142 maddr = kmap(page);
5143 if (write) {
5144 copy_to_user_page(vma, page, addr,
5145 maddr + offset, buf, bytes);
5146 set_page_dirty_lock(page);
5147 } else {
5148 copy_from_user_page(vma, page, addr,
5149 buf, maddr + offset, bytes);
5150 }
5151 kunmap(page);
5152 put_page(page);
5153 }
5154 len -= bytes;
5155 buf += bytes;
5156 addr += bytes;
5157 }
5158 mmap_read_unlock(mm);
5159
5160 return buf - old_buf;
5161}
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175int access_remote_vm(struct mm_struct *mm, unsigned long addr,
5176 void *buf, int len, unsigned int gup_flags)
5177{
5178 return __access_remote_vm(mm, addr, buf, len, gup_flags);
5179}
5180
5181
5182
5183
5184
5185
5186int access_process_vm(struct task_struct *tsk, unsigned long addr,
5187 void *buf, int len, unsigned int gup_flags)
5188{
5189 struct mm_struct *mm;
5190 int ret;
5191
5192 mm = get_task_mm(tsk);
5193 if (!mm)
5194 return 0;
5195
5196 ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
5197
5198 mmput(mm);
5199
5200 return ret;
5201}
5202EXPORT_SYMBOL_GPL(access_process_vm);
5203
5204
5205
5206
5207void print_vma_addr(char *prefix, unsigned long ip)
5208{
5209 struct mm_struct *mm = current->mm;
5210 struct vm_area_struct *vma;
5211
5212
5213
5214
5215 if (!mmap_read_trylock(mm))
5216 return;
5217
5218 vma = find_vma(mm, ip);
5219 if (vma && vma->vm_file) {
5220 struct file *f = vma->vm_file;
5221 char *buf = (char *)__get_free_page(GFP_NOWAIT);
5222 if (buf) {
5223 char *p;
5224
5225 p = file_path(f, buf, PAGE_SIZE);
5226 if (IS_ERR(p))
5227 p = "?";
5228 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
5229 vma->vm_start,
5230 vma->vm_end - vma->vm_start);
5231 free_page((unsigned long)buf);
5232 }
5233 }
5234 mmap_read_unlock(mm);
5235}
5236
5237#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5238void __might_fault(const char *file, int line)
5239{
5240
5241
5242
5243
5244
5245
5246 if (uaccess_kernel())
5247 return;
5248 if (pagefault_disabled())
5249 return;
5250 __might_sleep(file, line);
5251#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5252 if (current->mm)
5253 might_lock_read(&current->mm->mmap_lock);
5254#endif
5255}
5256EXPORT_SYMBOL(__might_fault);
5257#endif
5258
5259#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
5260
5261
5262
5263
5264
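/*
 * process_huge_page - clear or copy a huge page one subpage at a time,
 * with cond_resched() between subpages, ordering the work so that the
 * subpages around the address the fault hinted at are touched last and
 * are therefore most likely to still be cache-hot when the task resumes.
 */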
5265static inline void process_huge_page(
5266 unsigned long addr_hint, unsigned int pages_per_huge_page,
5267 void (*process_subpage)(unsigned long addr, int idx, void *arg),
5268 void *arg)
5269{
5270 int i, n, base, l;
5271 unsigned long addr = addr_hint &
5272 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5273
5274
5275 might_sleep();
5276 n = (addr_hint - addr) / PAGE_SIZE;
5277 if (2 * n <= pages_per_huge_page) {
5278
5279 base = 0;
5280 l = n;
5281
5282 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
5283 cond_resched();
5284 process_subpage(addr + i * PAGE_SIZE, i, arg);
5285 }
5286 } else {
5287
5288 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
5289 l = pages_per_huge_page - n;
5290
5291 for (i = 0; i < base; i++) {
5292 cond_resched();
5293 process_subpage(addr + i * PAGE_SIZE, i, arg);
5294 }
5295 }
5296
5297
5298
5299
5300 for (i = 0; i < l; i++) {
5301 int left_idx = base + i;
5302 int right_idx = base + 2 * l - 1 - i;
5303
5304 cond_resched();
5305 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
5306 cond_resched();
5307 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
5308 }
5309}
5310
5311static void clear_gigantic_page(struct page *page,
5312 unsigned long addr,
5313 unsigned int pages_per_huge_page)
5314{
5315 int i;
5316 struct page *p = page;
5317
5318 might_sleep();
5319 for (i = 0; i < pages_per_huge_page;
5320 i++, p = mem_map_next(p, page, i)) {
5321 cond_resched();
5322 clear_user_highpage(p, addr + i * PAGE_SIZE);
5323 }
5324}
5325
5326static void clear_subpage(unsigned long addr, int idx, void *arg)
5327{
5328 struct page *page = arg;
5329
5330 clear_user_highpage(page + idx, addr);
5331}
5332
5333void clear_huge_page(struct page *page,
5334 unsigned long addr_hint, unsigned int pages_per_huge_page)
5335{
5336 unsigned long addr = addr_hint &
5337 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5338
5339 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5340 clear_gigantic_page(page, addr, pages_per_huge_page);
5341 return;
5342 }
5343
5344 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
5345}
5346
5347static void copy_user_gigantic_page(struct page *dst, struct page *src,
5348 unsigned long addr,
5349 struct vm_area_struct *vma,
5350 unsigned int pages_per_huge_page)
5351{
5352 int i;
5353 struct page *dst_base = dst;
5354 struct page *src_base = src;
5355
5356 for (i = 0; i < pages_per_huge_page; ) {
5357 cond_resched();
5358 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
5359
5360 i++;
5361 dst = mem_map_next(dst, dst_base, i);
5362 src = mem_map_next(src, src_base, i);
5363 }
5364}
5365
5366struct copy_subpage_arg {
5367 struct page *dst;
5368 struct page *src;
5369 struct vm_area_struct *vma;
5370};
5371
5372static void copy_subpage(unsigned long addr, int idx, void *arg)
5373{
5374 struct copy_subpage_arg *copy_arg = arg;
5375
5376 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
5377 addr, copy_arg->vma);
5378}
5379
5380void copy_user_huge_page(struct page *dst, struct page *src,
5381 unsigned long addr_hint, struct vm_area_struct *vma,
5382 unsigned int pages_per_huge_page)
5383{
5384 unsigned long addr = addr_hint &
5385 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5386 struct copy_subpage_arg arg = {
5387 .dst = dst,
5388 .src = src,
5389 .vma = vma,
5390 };
5391
5392 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5393 copy_user_gigantic_page(dst, src, addr, vma,
5394 pages_per_huge_page);
5395 return;
5396 }
5397
5398 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
5399}
5400
5401long copy_huge_page_from_user(struct page *dst_page,
5402 const void __user *usr_src,
5403 unsigned int pages_per_huge_page,
5404 bool allow_pagefault)
5405{
5406 void *page_kaddr;
5407 unsigned long i, rc = 0;
5408 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
5409 struct page *subpage = dst_page;
5410
5411 for (i = 0; i < pages_per_huge_page;
5412 i++, subpage = mem_map_next(subpage, dst_page, i)) {
5413 if (allow_pagefault)
5414 page_kaddr = kmap(subpage);
5415 else
5416 page_kaddr = kmap_atomic(subpage);
5417 rc = copy_from_user(page_kaddr,
5418 usr_src + i * PAGE_SIZE, PAGE_SIZE);
5419 if (allow_pagefault)
5420 kunmap(subpage);
5421 else
5422 kunmap_atomic(page_kaddr);
5423
5424 ret_val -= (PAGE_SIZE - rc);
5425 if (rc)
5426 break;
5427
5428 cond_resched();
5429 }
5430 return ret_val;
5431}
5432#endif
5433
5434#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
5435
5436static struct kmem_cache *page_ptl_cachep;
5437
5438void __init ptlock_cache_init(void)
5439{
5440 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
5441 SLAB_PANIC, NULL);
5442}
5443
5444bool ptlock_alloc(struct page *page)
5445{
5446 spinlock_t *ptl;
5447
5448 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
5449 if (!ptl)
5450 return false;
5451 page->ptl = ptl;
5452 return true;
5453}
5454
5455void ptlock_free(struct page *page)
5456{
5457 kmem_cache_free(page_ptl_cachep, page->ptl);
5458}
5459#endif
5460