// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
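
/*
 * High-memory boundary: addresses at or above high_memory are not part
 * of the kernel's direct mapping and must be mapped (kmap(), vmalloc(),
 * ioremap(), ...) before they can be accessed.
 */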
108void *high_memory;
109EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
117int randomize_va_space __read_mostly =
118#ifdef CONFIG_COMPAT_BRK
119 1;
120#else
121 2;
122#endif
123
124#ifndef arch_faults_on_old_pte
125static inline bool arch_faults_on_old_pte(void)
126{
	/*
	 * Those arches which don't have hw access flag feature need to
	 * implement their own helper. By default, "true" means pagefault
	 * will be hit on old pte.
	 */
132 return true;
133}
134#endif
135
136#ifndef arch_wants_old_prefaulted_pte
137static inline bool arch_wants_old_prefaulted_pte(void)
138{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware. By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
144 return false;
145}
146#endif
147
148static int __init disable_randmaps(char *s)
149{
150 randomize_va_space = 0;
151 return 1;
152}
153__setup("norandmaps", disable_randmaps);
154
155unsigned long zero_pfn __read_mostly;
156EXPORT_SYMBOL(zero_pfn);
157
158unsigned long highest_memmap_pfn __read_mostly;
159
/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
163static int __init init_zero_pfn(void)
164{
165 zero_pfn = page_to_pfn(ZERO_PAGE(0));
166 return 0;
167}
168early_initcall(init_zero_pfn);
169
170void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
171{
172 trace_rss_stat(mm, member, count);
173}
174
175#if defined(SPLIT_RSS_COUNTING)
176
177void sync_mm_rss(struct mm_struct *mm)
178{
179 int i;
180
181 for (i = 0; i < NR_MM_COUNTERS; i++) {
182 if (current->rss_stat.count[i]) {
183 add_mm_counter(mm, i, current->rss_stat.count[i]);
184 current->rss_stat.count[i] = 0;
185 }
186 }
187 current->rss_stat.events = 0;
188}
189
190static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
191{
192 struct task_struct *task = current;
193
194 if (likely(task->mm == mm))
195 task->rss_stat.count[member] += val;
196 else
197 add_mm_counter(mm, member, val);
198}
199#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
200#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
203#define TASK_RSS_EVENTS_THRESH (64)
204static void check_sync_rss_stat(struct task_struct *task)
205{
206 if (unlikely(task != current))
207 return;
208 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
209 sync_mm_rss(task->mm);
210}
211#else
212
213#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
214#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
215
216static void check_sync_rss_stat(struct task_struct *task)
217{
218}
219
220#endif
221
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
226static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
227 unsigned long addr)
228{
229 pgtable_t token = pmd_pgtable(*pmd);
230 pmd_clear(pmd);
231 pte_free_tlb(tlb, token, addr);
232 mm_dec_nr_ptes(tlb->mm);
233}
234
235static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
236 unsigned long addr, unsigned long end,
237 unsigned long floor, unsigned long ceiling)
238{
239 pmd_t *pmd;
240 unsigned long next;
241 unsigned long start;
242
243 start = addr;
244 pmd = pmd_offset(pud, addr);
245 do {
246 next = pmd_addr_end(addr, end);
247 if (pmd_none_or_clear_bad(pmd))
248 continue;
249 free_pte_range(tlb, pmd, addr);
250 } while (pmd++, addr = next, addr != end);
251
252 start &= PUD_MASK;
253 if (start < floor)
254 return;
255 if (ceiling) {
256 ceiling &= PUD_MASK;
257 if (!ceiling)
258 return;
259 }
260 if (end - 1 > ceiling - 1)
261 return;
262
263 pmd = pmd_offset(pud, start);
264 pud_clear(pud);
265 pmd_free_tlb(tlb, pmd, start);
266 mm_dec_nr_pmds(tlb->mm);
267}
268
269static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
270 unsigned long addr, unsigned long end,
271 unsigned long floor, unsigned long ceiling)
272{
273 pud_t *pud;
274 unsigned long next;
275 unsigned long start;
276
277 start = addr;
278 pud = pud_offset(p4d, addr);
279 do {
280 next = pud_addr_end(addr, end);
281 if (pud_none_or_clear_bad(pud))
282 continue;
283 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
284 } while (pud++, addr = next, addr != end);
285
286 start &= P4D_MASK;
287 if (start < floor)
288 return;
289 if (ceiling) {
290 ceiling &= P4D_MASK;
291 if (!ceiling)
292 return;
293 }
294 if (end - 1 > ceiling - 1)
295 return;
296
297 pud = pud_offset(p4d, start);
298 p4d_clear(p4d);
299 pud_free_tlb(tlb, pud, start);
300 mm_dec_nr_puds(tlb->mm);
301}
302
303static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
304 unsigned long addr, unsigned long end,
305 unsigned long floor, unsigned long ceiling)
306{
307 p4d_t *p4d;
308 unsigned long next;
309 unsigned long start;
310
311 start = addr;
312 p4d = p4d_offset(pgd, addr);
313 do {
314 next = p4d_addr_end(addr, end);
315 if (p4d_none_or_clear_bad(p4d))
316 continue;
317 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
318 } while (p4d++, addr = next, addr != end);
319
320 start &= PGDIR_MASK;
321 if (start < floor)
322 return;
323 if (ceiling) {
324 ceiling &= PGDIR_MASK;
325 if (!ceiling)
326 return;
327 }
328 if (end - 1 > ceiling - 1)
329 return;
330
331 p4d = p4d_offset(pgd, start);
332 pgd_clear(pgd);
333 p4d_free_tlb(tlb, p4d, start);
334}
335
/*
 * This function frees user-level page tables of a process.
 */
339void free_pgd_range(struct mmu_gather *tlb,
340 unsigned long addr, unsigned long end,
341 unsigned long floor, unsigned long ceiling)
342{
343 pgd_t *pgd;
344 unsigned long next;
345
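	/*
	 * Trim addr and end so that we never free page-table pages that
	 * may still be needed for addresses below 'floor' or above
	 * 'ceiling': those bounds protect page tables belonging to
	 * neighbouring vmas (and the guard areas at either end of the
	 * address space).  The comparisons are done on "value - 1" so
	 * that a ceiling of zero means "no ceiling".
	 */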
372 addr &= PMD_MASK;
373 if (addr < floor) {
374 addr += PMD_SIZE;
375 if (!addr)
376 return;
377 }
378 if (ceiling) {
379 ceiling &= PMD_MASK;
380 if (!ceiling)
381 return;
382 }
383 if (end - 1 > ceiling - 1)
384 end -= PMD_SIZE;
385 if (addr > end - 1)
386 return;
387
	/*
	 * Page table pages are freed with PAGE_SIZE granularity
	 * (see pte_free_tlb()), so make sure the mmu_gather is
	 * tracking that page size before we start.
	 */
391 tlb_change_page_size(tlb, PAGE_SIZE);
392 pgd = pgd_offset(tlb->mm, addr);
393 do {
394 next = pgd_addr_end(addr, end);
395 if (pgd_none_or_clear_bad(pgd))
396 continue;
397 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
398 } while (pgd++, addr = next, addr != end);
399}
400
401void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
402 unsigned long floor, unsigned long ceiling)
403{
404 while (vma) {
405 struct vm_area_struct *next = vma->vm_next;
406 unsigned long addr = vma->vm_start;
407
		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
412 unlink_anon_vmas(vma);
413 unlink_file_vma(vma);
414
415 if (is_vm_hugetlb_page(vma)) {
416 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
417 floor, next ? next->vm_start : ceiling);
418 } else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
422 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
423 && !is_vm_hugetlb_page(next)) {
424 vma = next;
425 next = vma->vm_next;
426 unlink_anon_vmas(vma);
427 unlink_file_vma(vma);
428 }
429 free_pgd_range(tlb, addr, vma->vm_end,
430 floor, next ? next->vm_start : ceiling);
431 }
432 vma = next;
433 }
434}
435
436int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
437{
438 spinlock_t *ptl;
439 pgtable_t new = pte_alloc_one(mm);
440 if (!new)
441 return -ENOMEM;
442
	/*
	 * Ensure all pte setup (e.g. pte page lock and page clearing) is
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.  The read side of the page-table walk
	 * relies on data-dependent loads (plus smp_rmb() on alpha), so a
	 * plain smp_wmb() is sufficient here.
	 */
	smp_wmb();
457
458 ptl = pmd_lock(mm, pmd);
459 if (likely(pmd_none(*pmd))) {
460 mm_inc_nr_ptes(mm);
461 pmd_populate(mm, pmd, new);
462 new = NULL;
463 }
464 spin_unlock(ptl);
465 if (new)
466 pte_free(mm, new);
467 return 0;
468}
469
470int __pte_alloc_kernel(pmd_t *pmd)
471{
472 pte_t *new = pte_alloc_one_kernel(&init_mm);
473 if (!new)
474 return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */
477
478 spin_lock(&init_mm.page_table_lock);
479 if (likely(pmd_none(*pmd))) {
480 pmd_populate_kernel(&init_mm, pmd, new);
481 new = NULL;
482 }
483 spin_unlock(&init_mm.page_table_lock);
484 if (new)
485 pte_free_kernel(&init_mm, new);
486 return 0;
487}
488
489static inline void init_rss_vec(int *rss)
490{
491 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
492}
493
494static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
495{
496 int i;
497
498 if (current->mm == mm)
499 sync_mm_rss(mm);
500 for (i = 0; i < NR_MM_COUNTERS; i++)
501 if (rss[i])
502 add_mm_counter(mm, i, rss[i]);
503}
504
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
512static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
513 pte_t pte, struct page *page)
514{
515 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
516 p4d_t *p4d = p4d_offset(pgd, addr);
517 pud_t *pud = pud_offset(p4d, addr);
518 pmd_t *pmd = pmd_offset(pud, addr);
519 struct address_space *mapping;
520 pgoff_t index;
521 static unsigned long resume;
522 static unsigned long nr_shown;
523 static unsigned long nr_unshown;
524
	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
529 if (nr_shown == 60) {
530 if (time_before(jiffies, resume)) {
531 nr_unshown++;
532 return;
533 }
534 if (nr_unshown) {
535 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
536 nr_unshown);
537 nr_unshown = 0;
538 }
539 nr_shown = 0;
540 }
541 if (nr_shown++ == 0)
542 resume = jiffies + 60 * HZ;
543
544 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
545 index = linear_page_index(vma, addr);
546
547 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
548 current->comm,
549 (long long)pte_val(pte), (long long)pmd_val(*pmd));
550 if (page)
551 dump_page(page, "bad pte");
552 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
553 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
554 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
555 vma->vm_file,
556 vma->vm_ops ? vma->vm_ops->fault : NULL,
557 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
558 mapping ? mapping->a_ops->readpage : NULL);
559 dump_stack();
560 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
561}
562
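/*
 * vm_normal_page -- return the "struct page" backing a pte, or NULL if
 * the mapping is "special".
 *
 * "Normal" mappings have a struct page that the rest of the VM (rmap,
 * LRU, accounting, ...) may use.  "Special" mappings do not: raw PFN
 * mappings set up with remap_pfn_range(), VM_MIXEDMAP entries without a
 * valid struct page, the zero page, and device (pte_devmap) mappings.
 *
 * With CONFIG_ARCH_HAS_PTE_SPECIAL the pte itself carries a "special"
 * bit and the decision is trivial.  Without it we fall back to the vma
 * flags: a non-COW VM_PFNMAP mapping is special throughout, while a COW
 * VM_PFNMAP mapping is only special for the pfns linearly mapped from
 * vm_pgoff; pages inserted later by COW faults are normal.
 */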
605struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
606 pte_t pte)
607{
608 unsigned long pfn = pte_pfn(pte);
609
610 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
611 if (likely(!pte_special(pte)))
612 goto check_pfn;
613 if (vma->vm_ops && vma->vm_ops->find_special_page)
614 return vma->vm_ops->find_special_page(vma, addr);
615 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
616 return NULL;
617 if (is_zero_pfn(pfn))
618 return NULL;
619 if (pte_devmap(pte))
620 return NULL;
621
622 print_bad_pte(vma, addr, pte, NULL);
623 return NULL;
624 }
625
	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

628 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
629 if (vma->vm_flags & VM_MIXEDMAP) {
630 if (!pfn_valid(pfn))
631 return NULL;
632 goto out;
633 } else {
634 unsigned long off;
635 off = (addr - vma->vm_start) >> PAGE_SHIFT;
636 if (pfn == vma->vm_pgoff + off)
637 return NULL;
638 if (!is_cow_mapping(vma->vm_flags))
639 return NULL;
640 }
641 }
642
643 if (is_zero_pfn(pfn))
644 return NULL;
645
646check_pfn:
647 if (unlikely(pfn > highest_memmap_pfn)) {
648 print_bad_pte(vma, addr, pte, NULL);
649 return NULL;
650 }
651
	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
656out:
657 return pfn_to_page(pfn);
658}
659
660#ifdef CONFIG_TRANSPARENT_HUGEPAGE
661struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
662 pmd_t pmd)
663{
664 unsigned long pfn = pmd_pfn(pmd);
665
	/*
	 * There is no pmd_special(), but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
671 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
672 if (vma->vm_flags & VM_MIXEDMAP) {
673 if (!pfn_valid(pfn))
674 return NULL;
675 goto out;
676 } else {
677 unsigned long off;
678 off = (addr - vma->vm_start) >> PAGE_SHIFT;
679 if (pfn == vma->vm_pgoff + off)
680 return NULL;
681 if (!is_cow_mapping(vma->vm_flags))
682 return NULL;
683 }
684 }
685
686 if (pmd_devmap(pmd))
687 return NULL;
688 if (is_huge_zero_pmd(pmd))
689 return NULL;
690 if (unlikely(pfn > highest_memmap_pfn))
691 return NULL;
692
693
694
695
696
697out:
698 return pfn_to_page(pfn);
699}
700#endif
701
702static void restore_exclusive_pte(struct vm_area_struct *vma,
703 struct page *page, unsigned long address,
704 pte_t *ptep)
705{
706 pte_t pte;
707 swp_entry_t entry;
708
709 pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
710 if (pte_swp_soft_dirty(*ptep))
711 pte = pte_mksoft_dirty(pte);
712
713 entry = pte_to_swp_entry(*ptep);
714 if (pte_swp_uffd_wp(*ptep))
715 pte = pte_mkuffd_wp(pte);
716 else if (is_writable_device_exclusive_entry(entry))
717 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
718
719 set_pte_at(vma->vm_mm, address, ptep, pte);
720
	/*
	 * No need to take a page reference as one was already
	 * created when the swap entry was made.
	 */
725 if (PageAnon(page))
726 page_add_anon_rmap(page, vma, address, false);
727 else
		/*
		 * Currently device exclusive access only supports anonymous
		 * memory so the entry shouldn't point to a filebacked page.
		 */
732 WARN_ON_ONCE(!PageAnon(page));
733
734 if (vma->vm_flags & VM_LOCKED)
735 mlock_vma_page(page);
736
	/*
	 * No need to invalidate - it was non-present before. However
	 * secondary CPUs may have mappings that need invalidating.
	 */
741 update_mmu_cache(vma, address, ptep);
742}
743
/*
 * Tries to restore an exclusive pte if the page lock can be acquired
 * without sleeping.
 */
748static int
749try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
750 unsigned long addr)
751{
752 swp_entry_t entry = pte_to_swp_entry(*src_pte);
753 struct page *page = pfn_swap_entry_to_page(entry);
754
755 if (trylock_page(page)) {
756 restore_exclusive_pte(vma, page, addr, src_pte);
757 unlock_page(page);
758 return 0;
759 }
760
761 return -EBUSY;
762}
763
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

770static unsigned long
771copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
772 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
773 struct vm_area_struct *src_vma, unsigned long addr, int *rss)
774{
775 unsigned long vm_flags = dst_vma->vm_flags;
776 pte_t pte = *src_pte;
777 struct page *page;
778 swp_entry_t entry = pte_to_swp_entry(pte);
779
780 if (likely(!non_swap_entry(entry))) {
781 if (swap_duplicate(entry) < 0)
782 return -EIO;
783
784
785 if (unlikely(list_empty(&dst_mm->mmlist))) {
786 spin_lock(&mmlist_lock);
787 if (list_empty(&dst_mm->mmlist))
788 list_add(&dst_mm->mmlist,
789 &src_mm->mmlist);
790 spin_unlock(&mmlist_lock);
791 }
792 rss[MM_SWAPENTS]++;
793 } else if (is_migration_entry(entry)) {
794 page = pfn_swap_entry_to_page(entry);
795
796 rss[mm_counter(page)]++;
797
798 if (is_writable_migration_entry(entry) &&
799 is_cow_mapping(vm_flags)) {
800
801
802
803
804 entry = make_readable_migration_entry(
805 swp_offset(entry));
806 pte = swp_entry_to_pte(entry);
807 if (pte_swp_soft_dirty(*src_pte))
808 pte = pte_swp_mksoft_dirty(pte);
809 if (pte_swp_uffd_wp(*src_pte))
810 pte = pte_swp_mkuffd_wp(pte);
811 set_pte_at(src_mm, addr, src_pte, pte);
812 }
813 } else if (is_device_private_entry(entry)) {
814 page = pfn_swap_entry_to_page(entry);
815
816
817
818
819
820
821
822
823
824
825 get_page(page);
826 rss[mm_counter(page)]++;
827 page_dup_rmap(page, false);
828
829
830
831
832
833
834
835
836 if (is_writable_device_private_entry(entry) &&
837 is_cow_mapping(vm_flags)) {
838 entry = make_readable_device_private_entry(
839 swp_offset(entry));
840 pte = swp_entry_to_pte(entry);
841 if (pte_swp_uffd_wp(*src_pte))
842 pte = pte_swp_mkuffd_wp(pte);
843 set_pte_at(src_mm, addr, src_pte, pte);
844 }
845 } else if (is_device_exclusive_entry(entry)) {
846
847
848
849
850
851
852 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
853 if (try_restore_exclusive_pte(src_pte, src_vma, addr))
854 return -EBUSY;
855 return -ENOENT;
856 }
857 if (!userfaultfd_wp(dst_vma))
858 pte = pte_swp_clear_uffd_wp(pte);
859 set_pte_at(dst_mm, addr, dst_pte, pte);
860 return 0;
861}
862
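/*
 * Copy a present and normal page.
 *
 * The usual case is that the page can simply be shared between parent
 * and child by bumping its refcount; a private copy is only made here
 * when the page must not be shared (see page_needs_cow_for_dma()),
 * e.g. because it may be pinned for DMA.
 *
 * Returns 1 if the caller should fall back to sharing the page, 0 if a
 * copy was installed, or -EAGAIN if a pre-allocated destination page is
 * needed but not yet available.
 */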
883static inline int
884copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
885 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
886 struct page **prealloc, pte_t pte, struct page *page)
887{
888 struct page *new_page;
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903 if (likely(!page_needs_cow_for_dma(src_vma, page)))
904 return 1;
905
906 new_page = *prealloc;
907 if (!new_page)
908 return -EAGAIN;
909
910
911
912
913
914 *prealloc = NULL;
915 copy_user_highpage(new_page, page, addr, src_vma);
916 __SetPageUptodate(new_page);
917 page_add_new_anon_rmap(new_page, dst_vma, addr, false);
918 lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
919 rss[mm_counter(new_page)]++;
920
921
922 pte = mk_pte(new_page, dst_vma->vm_page_prot);
923 pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
924 if (userfaultfd_pte_wp(dst_vma, *src_pte))
925
926 pte = pte_wrprotect(pte_mkuffd_wp(pte));
927 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
928 return 0;
929}
930
/*
 * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated
 * page is required to copy this pte.
 */
935static inline int
936copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
937 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
938 struct page **prealloc)
939{
940 struct mm_struct *src_mm = src_vma->vm_mm;
941 unsigned long vm_flags = src_vma->vm_flags;
942 pte_t pte = *src_pte;
943 struct page *page;
944
945 page = vm_normal_page(src_vma, addr, pte);
946 if (page) {
947 int retval;
948
949 retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
950 addr, rss, prealloc, pte, page);
951 if (retval <= 0)
952 return retval;
953
954 get_page(page);
955 page_dup_rmap(page, false);
956 rss[mm_counter(page)]++;
957 }
958
	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child.
	 */
963 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
964 ptep_set_wrprotect(src_mm, addr, src_pte);
965 pte = pte_wrprotect(pte);
966 }
967
	/*
	 * If it's a shared mapping, mark it clean in
	 * the child.
	 */
972 if (vm_flags & VM_SHARED)
973 pte = pte_mkclean(pte);
974 pte = pte_mkold(pte);
975
976 if (!userfaultfd_wp(dst_vma))
977 pte = pte_clear_uffd_wp(pte);
978
979 set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
980 return 0;
981}
982
983static inline struct page *
984page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
985 unsigned long addr)
986{
987 struct page *new_page;
988
989 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
990 if (!new_page)
991 return NULL;
992
993 if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
994 put_page(new_page);
995 return NULL;
996 }
997 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
998
999 return new_page;
1000}
1001
1002static int
1003copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1004 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1005 unsigned long end)
1006{
1007 struct mm_struct *dst_mm = dst_vma->vm_mm;
1008 struct mm_struct *src_mm = src_vma->vm_mm;
1009 pte_t *orig_src_pte, *orig_dst_pte;
1010 pte_t *src_pte, *dst_pte;
1011 spinlock_t *src_ptl, *dst_ptl;
1012 int progress, ret = 0;
1013 int rss[NR_MM_COUNTERS];
1014 swp_entry_t entry = (swp_entry_t){0};
1015 struct page *prealloc = NULL;
1016
1017again:
1018 progress = 0;
1019 init_rss_vec(rss);
1020
1021 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1022 if (!dst_pte) {
1023 ret = -ENOMEM;
1024 goto out;
1025 }
1026 src_pte = pte_offset_map(src_pmd, addr);
1027 src_ptl = pte_lockptr(src_mm, src_pmd);
1028 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1029 orig_src_pte = src_pte;
1030 orig_dst_pte = dst_pte;
1031 arch_enter_lazy_mmu_mode();
1032
1033 do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
1038 if (progress >= 32) {
1039 progress = 0;
1040 if (need_resched() ||
1041 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1042 break;
1043 }
1044 if (pte_none(*src_pte)) {
1045 progress++;
1046 continue;
1047 }
1048 if (unlikely(!pte_present(*src_pte))) {
1049 ret = copy_nonpresent_pte(dst_mm, src_mm,
1050 dst_pte, src_pte,
1051 dst_vma, src_vma,
1052 addr, rss);
1053 if (ret == -EIO) {
1054 entry = pte_to_swp_entry(*src_pte);
1055 break;
1056 } else if (ret == -EBUSY) {
1057 break;
1058 } else if (!ret) {
1059 progress += 8;
1060 continue;
1061 }

			/*
			 * Device exclusive entry restored, continue by copying
			 * the now present pte.
			 */
1067 WARN_ON_ONCE(ret != -ENOENT);
1068 }
1069
1070 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
1071 addr, rss, &prealloc);
1072
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 */
1076 if (unlikely(ret == -EAGAIN))
1077 break;
1078 if (unlikely(prealloc)) {
1079
1080
1081
1082
1083
1084
1085 put_page(prealloc);
1086 prealloc = NULL;
1087 }
1088 progress += 8;
1089 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1090
1091 arch_leave_lazy_mmu_mode();
1092 spin_unlock(src_ptl);
1093 pte_unmap(orig_src_pte);
1094 add_mm_rss_vec(dst_mm, rss);
1095 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1096 cond_resched();
1097
1098 if (ret == -EIO) {
1099 VM_WARN_ON_ONCE(!entry.val);
1100 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
1101 ret = -ENOMEM;
1102 goto out;
1103 }
1104 entry.val = 0;
1105 } else if (ret == -EBUSY) {
1106 goto out;
1107 } else if (ret == -EAGAIN) {
1108 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
1109 if (!prealloc)
1110 return -ENOMEM;
1111 } else if (ret) {
1112 VM_WARN_ON_ONCE(1);
1113 }
1114
	/* We've captured and resolved the error. Reset, try again. */
1116 ret = 0;
1117
1118 if (addr != end)
1119 goto again;
1120out:
1121 if (unlikely(prealloc))
1122 put_page(prealloc);
1123 return ret;
1124}
1125
1126static inline int
1127copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1128 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1129 unsigned long end)
1130{
1131 struct mm_struct *dst_mm = dst_vma->vm_mm;
1132 struct mm_struct *src_mm = src_vma->vm_mm;
1133 pmd_t *src_pmd, *dst_pmd;
1134 unsigned long next;
1135
1136 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1137 if (!dst_pmd)
1138 return -ENOMEM;
1139 src_pmd = pmd_offset(src_pud, addr);
1140 do {
1141 next = pmd_addr_end(addr, end);
1142 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1143 || pmd_devmap(*src_pmd)) {
1144 int err;
1145 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1146 err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1147 addr, dst_vma, src_vma);
1148 if (err == -ENOMEM)
1149 return -ENOMEM;
1150 if (!err)
1151 continue;
1152
1153 }
1154 if (pmd_none_or_clear_bad(src_pmd))
1155 continue;
1156 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1157 addr, next))
1158 return -ENOMEM;
1159 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1160 return 0;
1161}
1162
1163static inline int
1164copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1165 p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1166 unsigned long end)
1167{
1168 struct mm_struct *dst_mm = dst_vma->vm_mm;
1169 struct mm_struct *src_mm = src_vma->vm_mm;
1170 pud_t *src_pud, *dst_pud;
1171 unsigned long next;
1172
1173 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1174 if (!dst_pud)
1175 return -ENOMEM;
1176 src_pud = pud_offset(src_p4d, addr);
1177 do {
1178 next = pud_addr_end(addr, end);
1179 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1180 int err;
1181
1182 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1183 err = copy_huge_pud(dst_mm, src_mm,
1184 dst_pud, src_pud, addr, src_vma);
1185 if (err == -ENOMEM)
1186 return -ENOMEM;
1187 if (!err)
1188 continue;
1189
1190 }
1191 if (pud_none_or_clear_bad(src_pud))
1192 continue;
1193 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1194 addr, next))
1195 return -ENOMEM;
1196 } while (dst_pud++, src_pud++, addr = next, addr != end);
1197 return 0;
1198}
1199
1200static inline int
1201copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1202 pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1203 unsigned long end)
1204{
1205 struct mm_struct *dst_mm = dst_vma->vm_mm;
1206 p4d_t *src_p4d, *dst_p4d;
1207 unsigned long next;
1208
1209 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1210 if (!dst_p4d)
1211 return -ENOMEM;
1212 src_p4d = p4d_offset(src_pgd, addr);
1213 do {
1214 next = p4d_addr_end(addr, end);
1215 if (p4d_none_or_clear_bad(src_p4d))
1216 continue;
1217 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1218 addr, next))
1219 return -ENOMEM;
1220 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1221 return 0;
1222}
1223
1224int
1225copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1226{
1227 pgd_t *src_pgd, *dst_pgd;
1228 unsigned long next;
1229 unsigned long addr = src_vma->vm_start;
1230 unsigned long end = src_vma->vm_end;
1231 struct mm_struct *dst_mm = dst_vma->vm_mm;
1232 struct mm_struct *src_mm = src_vma->vm_mm;
1233 struct mmu_notifier_range range;
1234 bool is_cow;
1235 int ret;
1236
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1243 if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1244 !src_vma->anon_vma)
1245 return 0;
1246
1247 if (is_vm_hugetlb_page(src_vma))
1248 return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1249
1250 if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1251
1252
1253
1254
1255 ret = track_pfn_copy(src_vma);
1256 if (ret)
1257 return ret;
1258 }
1259
	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
1266 is_cow = is_cow_mapping(src_vma->vm_flags);
1267
1268 if (is_cow) {
1269 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
1270 0, src_vma, src_mm, addr, end);
1271 mmu_notifier_invalidate_range_start(&range);
1272
1273
1274
1275
1276
1277
1278
1279 mmap_assert_write_locked(src_mm);
1280 raw_write_seqcount_begin(&src_mm->write_protect_seq);
1281 }
1282
1283 ret = 0;
1284 dst_pgd = pgd_offset(dst_mm, addr);
1285 src_pgd = pgd_offset(src_mm, addr);
1286 do {
1287 next = pgd_addr_end(addr, end);
1288 if (pgd_none_or_clear_bad(src_pgd))
1289 continue;
1290 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1291 addr, next))) {
1292 ret = -ENOMEM;
1293 break;
1294 }
1295 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1296
1297 if (is_cow) {
1298 raw_write_seqcount_end(&src_mm->write_protect_seq);
1299 mmu_notifier_invalidate_range_end(&range);
1300 }
1301 return ret;
1302}
1303
1304static unsigned long zap_pte_range(struct mmu_gather *tlb,
1305 struct vm_area_struct *vma, pmd_t *pmd,
1306 unsigned long addr, unsigned long end,
1307 struct zap_details *details)
1308{
1309 struct mm_struct *mm = tlb->mm;
1310 int force_flush = 0;
1311 int rss[NR_MM_COUNTERS];
1312 spinlock_t *ptl;
1313 pte_t *start_pte;
1314 pte_t *pte;
1315 swp_entry_t entry;
1316
1317 tlb_change_page_size(tlb, PAGE_SIZE);
1318again:
1319 init_rss_vec(rss);
1320 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1321 pte = start_pte;
1322 flush_tlb_batched_pending(mm);
1323 arch_enter_lazy_mmu_mode();
1324 do {
1325 pte_t ptent = *pte;
1326 if (pte_none(ptent))
1327 continue;
1328
1329 if (need_resched())
1330 break;
1331
1332 if (pte_present(ptent)) {
1333 struct page *page;
1334
1335 page = vm_normal_page(vma, addr, ptent);
1336 if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
1342 if (details->check_mapping &&
1343 details->check_mapping != page_rmapping(page))
1344 continue;
1345 }
1346 ptent = ptep_get_and_clear_full(mm, addr, pte,
1347 tlb->fullmm);
1348 tlb_remove_tlb_entry(tlb, pte, addr);
1349 if (unlikely(!page))
1350 continue;
1351
1352 if (!PageAnon(page)) {
1353 if (pte_dirty(ptent)) {
1354 force_flush = 1;
1355 set_page_dirty(page);
1356 }
1357 if (pte_young(ptent) &&
1358 likely(!(vma->vm_flags & VM_SEQ_READ)))
1359 mark_page_accessed(page);
1360 }
1361 rss[mm_counter(page)]--;
1362 page_remove_rmap(page, false);
1363 if (unlikely(page_mapcount(page) < 0))
1364 print_bad_pte(vma, addr, ptent, page);
1365 if (unlikely(__tlb_remove_page(tlb, page))) {
1366 force_flush = 1;
1367 addr += PAGE_SIZE;
1368 break;
1369 }
1370 continue;
1371 }
1372
1373 entry = pte_to_swp_entry(ptent);
1374 if (is_device_private_entry(entry) ||
1375 is_device_exclusive_entry(entry)) {
1376 struct page *page = pfn_swap_entry_to_page(entry);
1377
1378 if (unlikely(details && details->check_mapping)) {
1379
1380
1381
1382
1383
1384 if (details->check_mapping !=
1385 page_rmapping(page))
1386 continue;
1387 }
1388
1389 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1390 rss[mm_counter(page)]--;
1391
1392 if (is_device_private_entry(entry))
1393 page_remove_rmap(page, false);
1394
1395 put_page(page);
1396 continue;
1397 }
1398
1399
1400 if (unlikely(details))
1401 continue;
1402
1403 if (!non_swap_entry(entry))
1404 rss[MM_SWAPENTS]--;
1405 else if (is_migration_entry(entry)) {
1406 struct page *page;
1407
1408 page = pfn_swap_entry_to_page(entry);
1409 rss[mm_counter(page)]--;
1410 }
1411 if (unlikely(!free_swap_and_cache(entry)))
1412 print_bad_pte(vma, addr, ptent, NULL);
1413 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1414 } while (pte++, addr += PAGE_SIZE, addr != end);
1415
1416 add_mm_rss_vec(mm, rss);
1417 arch_leave_lazy_mmu_mode();
1418
	/* Do the actual TLB flush before dropping ptl */
1420 if (force_flush)
1421 tlb_flush_mmu_tlbonly(tlb);
1422 pte_unmap_unlock(start_pte, ptl);
1423
	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
1430 if (force_flush) {
1431 force_flush = 0;
1432 tlb_flush_mmu(tlb);
1433 }
1434
1435 if (addr != end) {
1436 cond_resched();
1437 goto again;
1438 }
1439
1440 return addr;
1441}
1442
1443static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1444 struct vm_area_struct *vma, pud_t *pud,
1445 unsigned long addr, unsigned long end,
1446 struct zap_details *details)
1447{
1448 pmd_t *pmd;
1449 unsigned long next;
1450
1451 pmd = pmd_offset(pud, addr);
1452 do {
1453 next = pmd_addr_end(addr, end);
1454 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1455 if (next - addr != HPAGE_PMD_SIZE)
1456 __split_huge_pmd(vma, pmd, addr, false, NULL);
1457 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1458 goto next;
1459
1460 } else if (details && details->single_page &&
1461 PageTransCompound(details->single_page) &&
1462 next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
1463 spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
1469 spin_unlock(ptl);
1470 }
1471
1472
1473
1474
1475
1476
1477
1478
1479 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1480 goto next;
1481 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1482next:
1483 cond_resched();
1484 } while (pmd++, addr = next, addr != end);
1485
1486 return addr;
1487}
1488
1489static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1490 struct vm_area_struct *vma, p4d_t *p4d,
1491 unsigned long addr, unsigned long end,
1492 struct zap_details *details)
1493{
1494 pud_t *pud;
1495 unsigned long next;
1496
1497 pud = pud_offset(p4d, addr);
1498 do {
1499 next = pud_addr_end(addr, end);
1500 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1501 if (next - addr != HPAGE_PUD_SIZE) {
1502 mmap_assert_locked(tlb->mm);
1503 split_huge_pud(vma, pud, addr);
1504 } else if (zap_huge_pud(tlb, vma, pud, addr))
1505 goto next;
1506
1507 }
1508 if (pud_none_or_clear_bad(pud))
1509 continue;
1510 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1511next:
1512 cond_resched();
1513 } while (pud++, addr = next, addr != end);
1514
1515 return addr;
1516}
1517
1518static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1519 struct vm_area_struct *vma, pgd_t *pgd,
1520 unsigned long addr, unsigned long end,
1521 struct zap_details *details)
1522{
1523 p4d_t *p4d;
1524 unsigned long next;
1525
1526 p4d = p4d_offset(pgd, addr);
1527 do {
1528 next = p4d_addr_end(addr, end);
1529 if (p4d_none_or_clear_bad(p4d))
1530 continue;
1531 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1532 } while (p4d++, addr = next, addr != end);
1533
1534 return addr;
1535}
1536
1537void unmap_page_range(struct mmu_gather *tlb,
1538 struct vm_area_struct *vma,
1539 unsigned long addr, unsigned long end,
1540 struct zap_details *details)
1541{
1542 pgd_t *pgd;
1543 unsigned long next;
1544
1545 BUG_ON(addr >= end);
1546 tlb_start_vma(tlb, vma);
1547 pgd = pgd_offset(vma->vm_mm, addr);
1548 do {
1549 next = pgd_addr_end(addr, end);
1550 if (pgd_none_or_clear_bad(pgd))
1551 continue;
1552 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1553 } while (pgd++, addr = next, addr != end);
1554 tlb_end_vma(tlb, vma);
1555}
1556
1557
1558static void unmap_single_vma(struct mmu_gather *tlb,
1559 struct vm_area_struct *vma, unsigned long start_addr,
1560 unsigned long end_addr,
1561 struct zap_details *details)
1562{
1563 unsigned long start = max(vma->vm_start, start_addr);
1564 unsigned long end;
1565
1566 if (start >= vma->vm_end)
1567 return;
1568 end = min(vma->vm_end, end_addr);
1569 if (end <= vma->vm_start)
1570 return;
1571
1572 if (vma->vm_file)
1573 uprobe_munmap(vma, start, end);
1574
1575 if (unlikely(vma->vm_flags & VM_PFNMAP))
1576 untrack_pfn(vma, 0, 0);
1577
1578 if (start != end) {
1579 if (unlikely(is_vm_hugetlb_page(vma))) {
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591 if (vma->vm_file) {
1592 i_mmap_lock_write(vma->vm_file->f_mapping);
1593 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1594 i_mmap_unlock_write(vma->vm_file->f_mapping);
1595 }
1596 } else
1597 unmap_page_range(tlb, vma, start, end, details);
1598 }
1599}
1600
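/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list; only addresses between @start_addr
 * and @end_addr are unmapped.  The vma list must be sorted in ascending
 * virtual address order.
 *
 * The caller is responsible for flushing the whole unmapped address
 * range after unmap_vmas() returns.
 */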
1619void unmap_vmas(struct mmu_gather *tlb,
1620 struct vm_area_struct *vma, unsigned long start_addr,
1621 unsigned long end_addr)
1622{
1623 struct mmu_notifier_range range;
1624
1625 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1626 start_addr, end_addr);
1627 mmu_notifier_invalidate_range_start(&range);
1628 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1629 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1630 mmu_notifier_invalidate_range_end(&range);
1631}
1632
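/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list.
 */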
1641void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1642 unsigned long size)
1643{
1644 struct mmu_notifier_range range;
1645 struct mmu_gather tlb;
1646
1647 lru_add_drain();
1648 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1649 start, start + size);
1650 tlb_gather_mmu(&tlb, vma->vm_mm);
1651 update_hiwater_rss(vma->vm_mm);
1652 mmu_notifier_invalidate_range_start(&range);
1653 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1654 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1655 mmu_notifier_invalidate_range_end(&range);
1656 tlb_finish_mmu(&tlb);
1657}
1658
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1668static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1669 unsigned long size, struct zap_details *details)
1670{
1671 struct mmu_notifier_range range;
1672 struct mmu_gather tlb;
1673
1674 lru_add_drain();
1675 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1676 address, address + size);
1677 tlb_gather_mmu(&tlb, vma->vm_mm);
1678 update_hiwater_rss(vma->vm_mm);
1679 mmu_notifier_invalidate_range_start(&range);
1680 unmap_single_vma(&tlb, vma, address, range.end, details);
1681 mmu_notifier_invalidate_range_end(&range);
1682 tlb_finish_mmu(&tlb);
1683}
1684
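/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */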
1696void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1697 unsigned long size)
1698{
1699 if (address < vma->vm_start || address + size > vma->vm_end ||
1700 !(vma->vm_flags & VM_PFNMAP))
1701 return;
1702
1703 zap_page_range_single(vma, address, size, NULL);
1704}
1705EXPORT_SYMBOL_GPL(zap_vma_ptes);
1706
1707static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
1708{
1709 pgd_t *pgd;
1710 p4d_t *p4d;
1711 pud_t *pud;
1712 pmd_t *pmd;
1713
1714 pgd = pgd_offset(mm, addr);
1715 p4d = p4d_alloc(mm, pgd, addr);
1716 if (!p4d)
1717 return NULL;
1718 pud = pud_alloc(mm, p4d, addr);
1719 if (!pud)
1720 return NULL;
1721 pmd = pmd_alloc(mm, pud, addr);
1722 if (!pmd)
1723 return NULL;
1724
1725 VM_BUG_ON(pmd_trans_huge(*pmd));
1726 return pmd;
1727}
1728
1729pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1730 spinlock_t **ptl)
1731{
1732 pmd_t *pmd = walk_to_pmd(mm, addr);
1733
1734 if (!pmd)
1735 return NULL;
1736 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1737}
1738
1739static int validate_page_before_insert(struct page *page)
1740{
1741 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1742 return -EINVAL;
1743 flush_dcache_page(page);
1744 return 0;
1745}
1746
1747static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
1748 unsigned long addr, struct page *page, pgprot_t prot)
1749{
1750 if (!pte_none(*pte))
1751 return -EBUSY;
1752
1753 get_page(page);
1754 inc_mm_counter_fast(mm, mm_counter_file(page));
1755 page_add_file_rmap(page, false);
1756 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1757 return 0;
1758}
1759
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
1767static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1768 struct page *page, pgprot_t prot)
1769{
1770 struct mm_struct *mm = vma->vm_mm;
1771 int retval;
1772 pte_t *pte;
1773 spinlock_t *ptl;
1774
1775 retval = validate_page_before_insert(page);
1776 if (retval)
1777 goto out;
1778 retval = -ENOMEM;
1779 pte = get_locked_pte(mm, addr, &ptl);
1780 if (!pte)
1781 goto out;
1782 retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
1783 pte_unmap_unlock(pte, ptl);
1784out:
1785 return retval;
1786}
1787
1788#ifdef pte_index
1789static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
1790 unsigned long addr, struct page *page, pgprot_t prot)
1791{
1792 int err;
1793
1794 if (!page_count(page))
1795 return -EINVAL;
1796 err = validate_page_before_insert(page);
1797 if (err)
1798 return err;
1799 return insert_page_into_pte_locked(mm, pte, addr, page, prot);
1800}
1801
/* insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 */
1805static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
1806 struct page **pages, unsigned long *num, pgprot_t prot)
1807{
1808 pmd_t *pmd = NULL;
1809 pte_t *start_pte, *pte;
1810 spinlock_t *pte_lock;
1811 struct mm_struct *const mm = vma->vm_mm;
1812 unsigned long curr_page_idx = 0;
1813 unsigned long remaining_pages_total = *num;
1814 unsigned long pages_to_write_in_pmd;
1815 int ret;
1816more:
1817 ret = -EFAULT;
1818 pmd = walk_to_pmd(mm, addr);
1819 if (!pmd)
1820 goto out;
1821
1822 pages_to_write_in_pmd = min_t(unsigned long,
1823 remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
1824
	/* Allocate the PTE if necessary; takes PMD lock once only. */
1826 ret = -ENOMEM;
1827 if (pte_alloc(mm, pmd))
1828 goto out;
1829
1830 while (pages_to_write_in_pmd) {
1831 int pte_idx = 0;
1832 const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
1833
1834 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
1835 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
1836 int err = insert_page_in_batch_locked(mm, pte,
1837 addr, pages[curr_page_idx], prot);
1838 if (unlikely(err)) {
1839 pte_unmap_unlock(start_pte, pte_lock);
1840 ret = err;
1841 remaining_pages_total -= pte_idx;
1842 goto out;
1843 }
1844 addr += PAGE_SIZE;
1845 ++curr_page_idx;
1846 }
1847 pte_unmap_unlock(start_pte, pte_lock);
1848 pages_to_write_in_pmd -= batch_size;
1849 remaining_pages_total -= batch_size;
1850 }
1851 if (remaining_pages_total)
1852 goto more;
1853 ret = 0;
1854out:
1855 *num = remaining_pages_total;
1856 return ret;
1857}
1858#endif
1859
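/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */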
1875int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
1876 struct page **pages, unsigned long *num)
1877{
1878#ifdef pte_index
1879 const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
1880
1881 if (addr < vma->vm_start || end_addr >= vma->vm_end)
1882 return -EFAULT;
1883 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1884 BUG_ON(mmap_read_trylock(vma->vm_mm));
1885 BUG_ON(vma->vm_flags & VM_PFNMAP);
1886 vma->vm_flags |= VM_MIXEDMAP;
1887 }
1888
1889 return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
1890#else
1891 unsigned long idx = 0, pgcount = *num;
1892 int err = -EINVAL;
1893
1894 for (; idx < pgcount; ++idx) {
1895 err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
1896 if (err)
1897 break;
1898 }
1899 *num = pgcount - idx;
1900 return err;
1901#endif
1902}
1903EXPORT_SYMBOL(vm_insert_pages);
1904
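/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd have
 * to ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */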
1934int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1935 struct page *page)
1936{
1937 if (addr < vma->vm_start || addr >= vma->vm_end)
1938 return -EFAULT;
1939 if (!page_count(page))
1940 return -EINVAL;
1941 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1942 BUG_ON(mmap_read_trylock(vma->vm_mm));
1943 BUG_ON(vma->vm_flags & VM_PFNMAP);
1944 vma->vm_flags |= VM_MIXEDMAP;
1945 }
1946 return insert_page(vma, addr, page, vma->vm_page_prot);
1947}
1948EXPORT_SYMBOL(vm_insert_page);
1949
/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map a range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
1961static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1962 unsigned long num, unsigned long offset)
1963{
1964 unsigned long count = vma_pages(vma);
1965 unsigned long uaddr = vma->vm_start;
1966 int ret, i;
1967
1968
1969 if (offset >= num)
1970 return -ENXIO;
1971
1972
1973 if (count > num - offset)
1974 return -ENXIO;
1975
1976 for (i = 0; i < count; i++) {
1977 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1978 if (ret < 0)
1979 return ret;
1980 uaddr += PAGE_SIZE;
1981 }
1982
1983 return 0;
1984}
1985
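/**
 * vm_map_pages - map a range of kernel pages at the vma's requested offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */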
2004int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2005 unsigned long num)
2006{
2007 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
2008}
2009EXPORT_SYMBOL(vm_map_pages);
2010
/**
 * vm_map_pages_zero - map range of kernel pages starting at zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for drivers that do not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
2024int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
2025 unsigned long num)
2026{
2027 return __vm_map_pages(vma, pages, num, 0);
2028}
2029EXPORT_SYMBOL(vm_map_pages_zero);
2030
2031static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2032 pfn_t pfn, pgprot_t prot, bool mkwrite)
2033{
2034 struct mm_struct *mm = vma->vm_mm;
2035 pte_t *pte, entry;
2036 spinlock_t *ptl;
2037
2038 pte = get_locked_pte(mm, addr, &ptl);
2039 if (!pte)
2040 return VM_FAULT_OOM;
2041 if (!pte_none(*pte)) {
2042 if (mkwrite) {
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
2054 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
2055 goto out_unlock;
2056 }
2057 entry = pte_mkyoung(*pte);
2058 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2059 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
2060 update_mmu_cache(vma, addr, pte);
2061 }
2062 goto out_unlock;
2063 }
2064
2065
2066 if (pfn_t_devmap(pfn))
2067 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
2068 else
2069 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
2070
2071 if (mkwrite) {
2072 entry = pte_mkyoung(entry);
2073 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2074 }
2075
2076 set_pte_at(mm, addr, pte, entry);
2077 update_mmu_cache(vma, addr, pte);
2078
2079out_unlock:
2080 pte_unmap_unlock(pte, ptl);
2081 return VM_FAULT_NOPAGE;
2082}
2083
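/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */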
2105vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2106 unsigned long pfn, pgprot_t pgprot)
2107{
2108
2109
2110
2111
2112
2113
2114 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2115 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2116 (VM_PFNMAP|VM_MIXEDMAP));
2117 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2118 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2119
2120 if (addr < vma->vm_start || addr >= vma->vm_end)
2121 return VM_FAULT_SIGBUS;
2122
2123 if (!pfn_modify_allowed(pfn, pgprot))
2124 return VM_FAULT_SIGBUS;
2125
2126 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
2127
2128 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2129 false);
2130}
2131EXPORT_SYMBOL(vmf_insert_pfn_prot);
2132
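/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */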
2153vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2154 unsigned long pfn)
2155{
2156 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2157}
2158EXPORT_SYMBOL(vmf_insert_pfn);
2159
2160static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
2161{
2162
2163 if (vma->vm_flags & VM_MIXEDMAP)
2164 return true;
2165 if (pfn_t_devmap(pfn))
2166 return true;
2167 if (pfn_t_special(pfn))
2168 return true;
2169 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
2170 return true;
2171 return false;
2172}
2173
2174static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2175 unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2176 bool mkwrite)
2177{
2178 int err;
2179
2180 BUG_ON(!vm_mixed_ok(vma, pfn));
2181
2182 if (addr < vma->vm_start || addr >= vma->vm_end)
2183 return VM_FAULT_SIGBUS;
2184
2185 track_pfn_insert(vma, &pgprot, pfn);
2186
2187 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
2188 return VM_FAULT_SIGBUS;
2189
2190
2191
2192
2193
2194
2195
2196
2197 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
2198 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
2199 struct page *page;
2200
2201
2202
2203
2204
2205
2206 page = pfn_to_page(pfn_t_to_pfn(pfn));
2207 err = insert_page(vma, addr, page, pgprot);
2208 } else {
2209 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2210 }
2211
2212 if (err == -ENOMEM)
2213 return VM_FAULT_OOM;
2214 if (err < 0 && err != -EBUSY)
2215 return VM_FAULT_SIGBUS;
2216
2217 return VM_FAULT_NOPAGE;
2218}
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2247 pfn_t pfn, pgprot_t pgprot)
2248{
2249 return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2250}
2251EXPORT_SYMBOL(vmf_insert_mixed_prot);
2252
2253vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2254 pfn_t pfn)
2255{
2256 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2257}
2258EXPORT_SYMBOL(vmf_insert_mixed);
2259
2260
2261
2262
2263
2264
2265vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2266 unsigned long addr, pfn_t pfn)
2267{
2268 return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2269}
2270EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2271
2272
2273
2274
2275
2276
2277static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2278 unsigned long addr, unsigned long end,
2279 unsigned long pfn, pgprot_t prot)
2280{
2281 pte_t *pte, *mapped_pte;
2282 spinlock_t *ptl;
2283 int err = 0;
2284
2285 mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2286 if (!pte)
2287 return -ENOMEM;
2288 arch_enter_lazy_mmu_mode();
2289 do {
2290 BUG_ON(!pte_none(*pte));
2291 if (!pfn_modify_allowed(pfn, prot)) {
2292 err = -EACCES;
2293 break;
2294 }
2295 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2296 pfn++;
2297 } while (pte++, addr += PAGE_SIZE, addr != end);
2298 arch_leave_lazy_mmu_mode();
2299 pte_unmap_unlock(mapped_pte, ptl);
2300 return err;
2301}
2302
2303static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2304 unsigned long addr, unsigned long end,
2305 unsigned long pfn, pgprot_t prot)
2306{
2307 pmd_t *pmd;
2308 unsigned long next;
2309 int err;
2310
2311 pfn -= addr >> PAGE_SHIFT;
2312 pmd = pmd_alloc(mm, pud, addr);
2313 if (!pmd)
2314 return -ENOMEM;
2315 VM_BUG_ON(pmd_trans_huge(*pmd));
2316 do {
2317 next = pmd_addr_end(addr, end);
2318 err = remap_pte_range(mm, pmd, addr, next,
2319 pfn + (addr >> PAGE_SHIFT), prot);
2320 if (err)
2321 return err;
2322 } while (pmd++, addr = next, addr != end);
2323 return 0;
2324}
2325
2326static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2327 unsigned long addr, unsigned long end,
2328 unsigned long pfn, pgprot_t prot)
2329{
2330 pud_t *pud;
2331 unsigned long next;
2332 int err;
2333
2334 pfn -= addr >> PAGE_SHIFT;
2335 pud = pud_alloc(mm, p4d, addr);
2336 if (!pud)
2337 return -ENOMEM;
2338 do {
2339 next = pud_addr_end(addr, end);
2340 err = remap_pmd_range(mm, pud, addr, next,
2341 pfn + (addr >> PAGE_SHIFT), prot);
2342 if (err)
2343 return err;
2344 } while (pud++, addr = next, addr != end);
2345 return 0;
2346}
2347
2348static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2349 unsigned long addr, unsigned long end,
2350 unsigned long pfn, pgprot_t prot)
2351{
2352 p4d_t *p4d;
2353 unsigned long next;
2354 int err;
2355
2356 pfn -= addr >> PAGE_SHIFT;
2357 p4d = p4d_alloc(mm, pgd, addr);
2358 if (!p4d)
2359 return -ENOMEM;
2360 do {
2361 next = p4d_addr_end(addr, end);
2362 err = remap_pud_range(mm, p4d, addr, next,
2363 pfn + (addr >> PAGE_SHIFT), prot);
2364 if (err)
2365 return err;
2366 } while (p4d++, addr = next, addr != end);
2367 return 0;
2368}
2369
/*
 * Variant of remap_pfn_range() that does not call track_pfn_remap().  The
 * caller must have pre-validated the caching bits of the pgprot_t.
 */
2374int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
2375 unsigned long pfn, unsigned long size, pgprot_t prot)
2376{
2377 pgd_t *pgd;
2378 unsigned long next;
2379 unsigned long end = addr + PAGE_ALIGN(size);
2380 struct mm_struct *mm = vma->vm_mm;
2381 int err;
2382
2383 if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2384 return -EINVAL;
2385
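	/*
	 * Physically remapped pages are special: they are managed entirely
	 * by the driver, must not be touched by the core VM, and must not
	 * be copied, expanded or dumped.  We mark the vma
	 * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP accordingly below.
	 *
	 * There is a corner case for COW mappings: vm_normal_page() relies
	 * on vm_pgoff pointing at the first pfn of a linear mapping to
	 * recognise the raw pfns as special, so a COW mapping may only be
	 * remapped in full and has its vm_pgoff set to the base pfn here.
	 */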
2404 if (is_cow_mapping(vma->vm_flags)) {
2405 if (addr != vma->vm_start || end != vma->vm_end)
2406 return -EINVAL;
2407 vma->vm_pgoff = pfn;
2408 }
2409
2410 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2411
2412 BUG_ON(addr >= end);
2413 pfn -= addr >> PAGE_SHIFT;
2414 pgd = pgd_offset(mm, addr);
2415 flush_cache_range(vma, addr, end);
2416 do {
2417 next = pgd_addr_end(addr, end);
2418 err = remap_p4d_range(mm, pgd, addr, next,
2419 pfn + (addr >> PAGE_SHIFT), prot);
2420 if (err)
2421 return err;
2422 } while (pgd++, addr = next, addr != end);
2423
2424 return 0;
2425}
2426
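/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */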
2439int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2440 unsigned long pfn, unsigned long size, pgprot_t prot)
2441{
2442 int err;
2443
2444 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2445 if (err)
2446 return -EINVAL;
2447
2448 err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
2449 if (err)
2450 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2451 return err;
2452}
2453EXPORT_SYMBOL(remap_pfn_range);
2454
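/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */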
2470int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2471{
2472 unsigned long vm_len, pfn, pages;
2473
2474
2475 if (start + len < start)
2476 return -EINVAL;
2477
2478
2479
2480
2481
2482 len += start & ~PAGE_MASK;
2483 pfn = start >> PAGE_SHIFT;
2484 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2485 if (pfn + pages < pfn)
2486 return -EINVAL;
2487
2488
2489 if (vma->vm_pgoff > pages)
2490 return -EINVAL;
2491 pfn += vma->vm_pgoff;
2492 pages -= vma->vm_pgoff;
2493
2494
2495 vm_len = vma->vm_end - vma->vm_start;
2496 if (vm_len >> PAGE_SHIFT > pages)
2497 return -EINVAL;
2498
2499
2500 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2501}
2502EXPORT_SYMBOL(vm_iomap_memory);
2503
2504static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2505 unsigned long addr, unsigned long end,
2506 pte_fn_t fn, void *data, bool create,
2507 pgtbl_mod_mask *mask)
2508{
2509 pte_t *pte, *mapped_pte;
2510 int err = 0;
2511 spinlock_t *ptl;
2512
2513 if (create) {
2514 mapped_pte = pte = (mm == &init_mm) ?
2515 pte_alloc_kernel_track(pmd, addr, mask) :
2516 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2517 if (!pte)
2518 return -ENOMEM;
2519 } else {
2520 mapped_pte = pte = (mm == &init_mm) ?
2521 pte_offset_kernel(pmd, addr) :
2522 pte_offset_map_lock(mm, pmd, addr, &ptl);
2523 }
2524
2525 BUG_ON(pmd_huge(*pmd));
2526
2527 arch_enter_lazy_mmu_mode();
2528
2529 if (fn) {
2530 do {
2531 if (create || !pte_none(*pte)) {
2532 err = fn(pte++, addr, data);
2533 if (err)
2534 break;
2535 }
2536 } while (addr += PAGE_SIZE, addr != end);
2537 }
2538 *mask |= PGTBL_PTE_MODIFIED;
2539
2540 arch_leave_lazy_mmu_mode();
2541
2542 if (mm != &init_mm)
2543 pte_unmap_unlock(mapped_pte, ptl);
2544 return err;
2545}
2546
2547static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2548 unsigned long addr, unsigned long end,
2549 pte_fn_t fn, void *data, bool create,
2550 pgtbl_mod_mask *mask)
2551{
2552 pmd_t *pmd;
2553 unsigned long next;
2554 int err = 0;
2555
2556 BUG_ON(pud_huge(*pud));
2557
2558 if (create) {
2559 pmd = pmd_alloc_track(mm, pud, addr, mask);
2560 if (!pmd)
2561 return -ENOMEM;
2562 } else {
2563 pmd = pmd_offset(pud, addr);
2564 }
2565 do {
2566 next = pmd_addr_end(addr, end);
2567 if (pmd_none(*pmd) && !create)
2568 continue;
2569 if (WARN_ON_ONCE(pmd_leaf(*pmd)))
2570 return -EINVAL;
2571 if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
2572 if (!create)
2573 continue;
2574 pmd_clear_bad(pmd);
2575 }
2576 err = apply_to_pte_range(mm, pmd, addr, next,
2577 fn, data, create, mask);
2578 if (err)
2579 break;
2580 } while (pmd++, addr = next, addr != end);
2581
2582 return err;
2583}
2584
2585static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2586 unsigned long addr, unsigned long end,
2587 pte_fn_t fn, void *data, bool create,
2588 pgtbl_mod_mask *mask)
2589{
2590 pud_t *pud;
2591 unsigned long next;
2592 int err = 0;
2593
2594 if (create) {
2595 pud = pud_alloc_track(mm, p4d, addr, mask);
2596 if (!pud)
2597 return -ENOMEM;
2598 } else {
2599 pud = pud_offset(p4d, addr);
2600 }
2601 do {
2602 next = pud_addr_end(addr, end);
2603 if (pud_none(*pud) && !create)
2604 continue;
2605 if (WARN_ON_ONCE(pud_leaf(*pud)))
2606 return -EINVAL;
2607 if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
2608 if (!create)
2609 continue;
2610 pud_clear_bad(pud);
2611 }
2612 err = apply_to_pmd_range(mm, pud, addr, next,
2613 fn, data, create, mask);
2614 if (err)
2615 break;
2616 } while (pud++, addr = next, addr != end);
2617
2618 return err;
2619}
2620
2621static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2622 unsigned long addr, unsigned long end,
2623 pte_fn_t fn, void *data, bool create,
2624 pgtbl_mod_mask *mask)
2625{
2626 p4d_t *p4d;
2627 unsigned long next;
2628 int err = 0;
2629
2630 if (create) {
2631 p4d = p4d_alloc_track(mm, pgd, addr, mask);
2632 if (!p4d)
2633 return -ENOMEM;
2634 } else {
2635 p4d = p4d_offset(pgd, addr);
2636 }
2637 do {
2638 next = p4d_addr_end(addr, end);
2639 if (p4d_none(*p4d) && !create)
2640 continue;
2641 if (WARN_ON_ONCE(p4d_leaf(*p4d)))
2642 return -EINVAL;
2643 if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
2644 if (!create)
2645 continue;
2646 p4d_clear_bad(p4d);
2647 }
2648 err = apply_to_pud_range(mm, p4d, addr, next,
2649 fn, data, create, mask);
2650 if (err)
2651 break;
2652 } while (p4d++, addr = next, addr != end);
2653
2654 return err;
2655}
2656
2657static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2658 unsigned long size, pte_fn_t fn,
2659 void *data, bool create)
2660{
2661 pgd_t *pgd;
2662 unsigned long start = addr, next;
2663 unsigned long end = addr + size;
2664 pgtbl_mod_mask mask = 0;
2665 int err = 0;
2666
2667 if (WARN_ON(addr >= end))
2668 return -EINVAL;
2669
2670 pgd = pgd_offset(mm, addr);
2671 do {
2672 next = pgd_addr_end(addr, end);
2673 if (pgd_none(*pgd) && !create)
2674 continue;
2675 if (WARN_ON_ONCE(pgd_leaf(*pgd)))
2676 return -EINVAL;
2677 if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
2678 if (!create)
2679 continue;
2680 pgd_clear_bad(pgd);
2681 }
2682 err = apply_to_p4d_range(mm, pgd, addr, next,
2683 fn, data, create, &mask);
2684 if (err)
2685 break;
2686 } while (pgd++, addr = next, addr != end);
2687
2688 if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2689 arch_sync_kernel_mappings(start, start + size);
2690
2691 return err;
2692}
2693
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2698int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2699 unsigned long size, pte_fn_t fn, void *data)
2700{
2701 return __apply_to_page_range(mm, addr, size, fn, data, true);
2702}
2703EXPORT_SYMBOL_GPL(apply_to_page_range);
2704
2705/*
2706 * Scan a region of virtual memory, calling a provided function on
2707 * each leaf page table entry where it exists.
2708 *
2709 * Unlike apply_to_page_range, this does _not_ fill in page tables
2710 * where they are absent.
2711 */
2712int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2713 unsigned long size, pte_fn_t fn, void *data)
2714{
2715 return __apply_to_page_range(mm, addr, size, fn, data, false);
2716}
2717EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
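
/*
 * Example (illustrative sketch): a pte_fn_t callback is invoked once per
 * leaf pte in the range, with the pte lock held for user address spaces;
 * returning non-zero stops the walk and is passed back to the caller.  The
 * helper and the start/size values below are hypothetical:
 *
 *	static int count_present_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long count = 0;
 *
 *	apply_to_existing_page_range(&init_mm, start, size,
 *				     count_present_pte, &count);
 */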
2718
2719/*
2720 * handle_pte_fault chooses a page fault handler according to an entry that
2721 * was read non-atomically.  Before making any commitment, on architectures
2722 * or configurations (e.g. i386 with PAE) which might give a mix of
2723 * unmatched halves, do_swap_page must re-check under the pte lock before
2724 * unmapping the pte and proceeding (do_wp_page is only called after
2725 * already making such a check; do_anonymous_page can safely check later).
2726 */
2727static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2728 pte_t *page_table, pte_t orig_pte)
2729{
2730 int same = 1;
2731#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
2732 if (sizeof(pte_t) > sizeof(unsigned long)) {
2733 spinlock_t *ptl = pte_lockptr(mm, pmd);
2734 spin_lock(ptl);
2735 same = pte_same(*page_table, orig_pte);
2736 spin_unlock(ptl);
2737 }
2738#endif
2739 pte_unmap(page_table);
2740 return same;
2741}
2742
2743static inline bool cow_user_page(struct page *dst, struct page *src,
2744 struct vm_fault *vmf)
2745{
2746 bool ret;
2747 void *kaddr;
2748 void __user *uaddr;
2749 bool locked = false;
2750 struct vm_area_struct *vma = vmf->vma;
2751 struct mm_struct *mm = vma->vm_mm;
2752 unsigned long addr = vmf->address;
2753
2754 if (likely(src)) {
2755 copy_user_highpage(dst, src, addr, vma);
2756 return true;
2757 }
2758
2759	/*
2760	 * If the source page was a PFN mapping, we don't have a "struct page"
2761	 * for it.  Do a best-effort copy by reading directly from the user
2762	 * address; if that read faults because the pte went stale or the page
2763	 * is unreadable, the fallbacks below handle it.
2764	 */
2765 kaddr = kmap_atomic(dst);
2766 uaddr = (void __user *)(addr & PAGE_MASK);
2767
2768	/*
2769	 * On architectures with software "accessed" bits, we would otherwise
2770	 * take a double page fault, so mark the pte young here.
2771	 */
2772 if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
2773 pte_t entry;
2774
2775 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2776 locked = true;
2777 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2778
2779
2780
2781
2782 update_mmu_tlb(vma, addr, vmf->pte);
2783 ret = false;
2784 goto pte_unlock;
2785 }
2786
2787 entry = pte_mkyoung(vmf->orig_pte);
2788 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
2789 update_mmu_cache(vma, addr, vmf->pte);
2790 }
2791
2792
2793	/*
2794	 * This really shouldn't fail, because the page is there in the page
2795	 * tables.  But it might just be unreadable, in which case we re-check
2796	 * the pte under its lock below and, as a last resort, zero-fill.
2797	 */
2798 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2799 if (locked)
2800 goto warn;
2801
2802
2803 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2804 locked = true;
2805 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2806
2807 update_mmu_tlb(vma, addr, vmf->pte);
2808 ret = false;
2809 goto pte_unlock;
2810 }
2811
2812
2813
2814
2815
2816 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
2817
2818
2819
2820
2821warn:
2822 WARN_ON_ONCE(1);
2823 clear_page(kaddr);
2824 }
2825 }
2826
2827 ret = true;
2828
2829pte_unlock:
2830 if (locked)
2831 pte_unmap_unlock(vmf->pte, vmf->ptl);
2832 kunmap_atomic(kaddr);
2833 flush_dcache_page(dst);
2834
2835 return ret;
2836}
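
/*
 * Note on cow_user_page() above: a false return is not an error.  It means
 * the pte changed while the source was being copied (another thread already
 * handled the fault), so the caller releases the new page and simply lets
 * the faulting instruction retry.  An unreadable source is handled inside
 * the helper by zero-filling the destination instead.
 */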
2837
2838static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2839{
2840 struct file *vm_file = vma->vm_file;
2841
2842 if (vm_file)
2843 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2844
2845	/*
2846	 * Special mappings (e.g. VDSO) do not have any file so fake
2847	 * a default GFP_KERNEL for them.
2848	 */
2849 return GFP_KERNEL;
2850}
2851
2852
2853/*
2854 * Notify the address space that the page is about to become writable so
2855 * that it can prohibit this or wait for the page to get into an
2856 * appropriate state.  Called without the pte lock held, so it can sleep.
2857 */
2858static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2859{
2860 vm_fault_t ret;
2861 struct page *page = vmf->page;
2862 unsigned int old_flags = vmf->flags;
2863
2864 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2865
2866 if (vmf->vma->vm_file &&
2867 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2868 return VM_FAULT_SIGBUS;
2869
2870 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2871
2872 vmf->flags = old_flags;
2873 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2874 return ret;
2875 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2876 lock_page(page);
2877 if (!page->mapping) {
2878 unlock_page(page);
2879 return 0;
2880 }
2881 ret |= VM_FAULT_LOCKED;
2882 } else
2883 VM_BUG_ON_PAGE(!PageLocked(page), page);
2884 return ret;
2885}
2886
2887
2888/*
2889 * Handle dirtying of a page in a shared file mapping on a write fault.
2890 * The function expects the page to be locked and unlocks it.
2891 */
2892static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2893{
2894 struct vm_area_struct *vma = vmf->vma;
2895 struct address_space *mapping;
2896 struct page *page = vmf->page;
2897 bool dirtied;
2898 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2899
2900 dirtied = set_page_dirty(page);
2901 VM_BUG_ON_PAGE(PageAnon(page), page);
2902
2903
2904
2905
2906
2907
2908 mapping = page_rmapping(page);
2909 unlock_page(page);
2910
2911 if (!page_mkwrite)
2912 file_update_time(vma->vm_file);
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923 if ((dirtied || page_mkwrite) && mapping) {
2924 struct file *fpin;
2925
2926 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2927 balance_dirty_pages_ratelimited(mapping);
2928 if (fpin) {
2929 fput(fpin);
2930 return VM_FAULT_RETRY;
2931 }
2932 }
2933
2934 return 0;
2935}
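
/*
 * Note on fault_dirty_shared_page() above: when the fault flags allow it,
 * maybe_unlock_mmap_for_io() pins the file and drops the mmap_lock, so the
 * writeback throttling in balance_dirty_pages_ratelimited() runs without
 * the lock held; VM_FAULT_RETRY then tells the caller to redo the fault.
 */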
2936
2937
2938/*
2939 * Handle write page faults for pages that can be reused in the current vma.
2940 * This can happen either due to the mapping being with the VM_SHARED flag,
2941 * or due to us being the last reference standing to the page.  In either
2942 * case, all we need to do here is to mark the page as writable and update
2943 * any related book-keeping.
2944 */
2945static inline void wp_page_reuse(struct vm_fault *vmf)
2946 __releases(vmf->ptl)
2947{
2948 struct vm_area_struct *vma = vmf->vma;
2949 struct page *page = vmf->page;
2950 pte_t entry;
2951
2952
2953
2954
2955
2956 if (page)
2957 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2958
2959 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2960 entry = pte_mkyoung(vmf->orig_pte);
2961 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2962 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2963 update_mmu_cache(vma, vmf->address, vmf->pte);
2964 pte_unmap_unlock(vmf->pte, vmf->ptl);
2965 count_vm_event(PGREUSE);
2966}
2967
2968
2969/*
2970 * Handle the case of a page which we actually need to copy to a new page,
2971 * because of COW.
2972 *
2973 * Called with mmap_lock locked and the old page referenced, but without
2974 * the ptl held.
2975 *
2976 * High level logic flow:
2977 *
2978 * - Allocate a page, copy the content of the old page to the new one.
2979 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
2980 * - Take the ptl.  If the pte changed, bail out and release the new page.
2981 * - If the pte is still the way we remember it, update the page table and
2982 *   all relevant references, including the rmap of the old page.
2983 */
2984static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2985{
2986 struct vm_area_struct *vma = vmf->vma;
2987 struct mm_struct *mm = vma->vm_mm;
2988 struct page *old_page = vmf->page;
2989 struct page *new_page = NULL;
2990 pte_t entry;
2991 int page_copied = 0;
2992 struct mmu_notifier_range range;
2993
2994 if (unlikely(anon_vma_prepare(vma)))
2995 goto oom;
2996
2997 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2998 new_page = alloc_zeroed_user_highpage_movable(vma,
2999 vmf->address);
3000 if (!new_page)
3001 goto oom;
3002 } else {
3003 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3004 vmf->address);
3005 if (!new_page)
3006 goto oom;
3007
3008 if (!cow_user_page(new_page, old_page, vmf)) {
3009
3010			/*
3011			 * COW failed: if the fault was solved by another
3012			 * thread, that's fine.  If not, userspace will re-fault
3013			 * on the same address and we handle it then.
3014			 */
3015 put_page(new_page);
3016 if (old_page)
3017 put_page(old_page);
3018 return 0;
3019 }
3020 }
3021
3022 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
3023 goto oom_free_new;
3024 cgroup_throttle_swaprate(new_page, GFP_KERNEL);
3025
3026 __SetPageUptodate(new_page);
3027
3028 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
3029 vmf->address & PAGE_MASK,
3030 (vmf->address & PAGE_MASK) + PAGE_SIZE);
3031 mmu_notifier_invalidate_range_start(&range);
3032
3033
3034
3035
3036 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3037 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
3038 if (old_page) {
3039 if (!PageAnon(old_page)) {
3040 dec_mm_counter_fast(mm,
3041 mm_counter_file(old_page));
3042 inc_mm_counter_fast(mm, MM_ANONPAGES);
3043 }
3044 } else {
3045 inc_mm_counter_fast(mm, MM_ANONPAGES);
3046 }
3047 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3048 entry = mk_pte(new_page, vma->vm_page_prot);
3049 entry = pte_sw_mkyoung(entry);
3050 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3051
3052		/*
3053		 * Clear the pte entry and flush it first, before updating the
3054		 * pte with the new entry, to keep TLBs on different CPUs in
3055		 * sync.  Setting the new pte before flushing would leave a
3056		 * window where the new pte could be loaded into some TLBs
3057		 * while the old one remains in others.
3058		 */
3059 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
3060 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
3061 lru_cache_add_inactive_or_unevictable(new_page, vma);
3062
3063
3064
3065
3066
3067 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
3068 update_mmu_cache(vma, vmf->address, vmf->pte);
3069 if (old_page) {
3070
3071			/*
3072			 * Only after switching the pte to the new page may
3073			 * we remove the mapcount here.  Otherwise another
3074			 * process may come and find the rmap count decremented
3075			 * before the pte is switched to the new page, and
3076			 * "reuse" the old page writing into it while our pte
3077			 * here still points into it and can be read by other
3078			 * threads.
3079			 *
3080			 * The critical issue is to order this page_remove_rmap
3081			 * with the ptep_clear_flush_notify above.  Those stores
3082			 * are ordered by (if nothing else) the barrier present
3083			 * in the atomic_add_negative in page_remove_rmap.
3084			 *
3085			 * Then the TLB flush in ptep_clear_flush ensures that
3086			 * no process can access the old page before the
3087			 * decremented mapcount is visible.  And the old page
3088			 * cannot be reused until after the decremented
3089			 * mapcount is visible.  So transitively, TLBs to the
3090			 * old page will be flushed before it can be reused.
3091			 */
3092 page_remove_rmap(old_page, false);
3093 }
3094
3095
3096 new_page = old_page;
3097 page_copied = 1;
3098 } else {
3099 update_mmu_tlb(vma, vmf->address, vmf->pte);
3100 }
3101
3102 if (new_page)
3103 put_page(new_page);
3104
3105 pte_unmap_unlock(vmf->pte, vmf->ptl);
3106
3107
3108
3109
3110 mmu_notifier_invalidate_range_only_end(&range);
3111 if (old_page) {
3112
3113
3114
3115
3116 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
3117 lock_page(old_page);
3118 if (PageMlocked(old_page))
3119 munlock_vma_page(old_page);
3120 unlock_page(old_page);
3121 }
3122 if (page_copied)
3123 free_swap_cache(old_page);
3124 put_page(old_page);
3125 }
3126 return page_copied ? VM_FAULT_WRITE : 0;
3127oom_free_new:
3128 put_page(new_page);
3129oom:
3130 if (old_page)
3131 put_page(old_page);
3132 return VM_FAULT_OOM;
3133}
3134
3135
3136/**
3137 * finish_mkwrite_fault - finish page fault for a shared mapping, making the
3138 *			  PTE writable once the page is prepared
3139 * @vmf: structure describing the fault
3140 *
3141 * This function handles all that is needed to finish a write page fault in
3142 * a shared mapping once the mapped page has been prepared: it takes the
3143 * pte lock, re-validates the pte and marks it writable via wp_page_reuse().
3144 *
3145 * The function expects the page to be locked or otherwise protected against
3146 * concurrent faults / writeback (such as DAX radix tree locks).
3147 *
3148 * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE changed before we
3149 * acquired the PTE lock.
3150 */
3151vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
3152{
3153 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3154 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3155 &vmf->ptl);
3156
3157
3158
3159
3160 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3161 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3162 pte_unmap_unlock(vmf->pte, vmf->ptl);
3163 return VM_FAULT_NOPAGE;
3164 }
3165 wp_page_reuse(vmf);
3166 return 0;
3167}
3168
3169/*
3170 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3171 * mapping.
3172 */
3173static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3174{
3175 struct vm_area_struct *vma = vmf->vma;
3176
3177 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3178 vm_fault_t ret;
3179
3180 pte_unmap_unlock(vmf->pte, vmf->ptl);
3181 vmf->flags |= FAULT_FLAG_MKWRITE;
3182 ret = vma->vm_ops->pfn_mkwrite(vmf);
3183 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3184 return ret;
3185 return finish_mkwrite_fault(vmf);
3186 }
3187 wp_page_reuse(vmf);
3188 return VM_FAULT_WRITE;
3189}
3190
3191static vm_fault_t wp_page_shared(struct vm_fault *vmf)
3192 __releases(vmf->ptl)
3193{
3194 struct vm_area_struct *vma = vmf->vma;
3195 vm_fault_t ret = VM_FAULT_WRITE;
3196
3197 get_page(vmf->page);
3198
3199 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3200 vm_fault_t tmp;
3201
3202 pte_unmap_unlock(vmf->pte, vmf->ptl);
3203 tmp = do_page_mkwrite(vmf);
3204 if (unlikely(!tmp || (tmp &
3205 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3206 put_page(vmf->page);
3207 return tmp;
3208 }
3209 tmp = finish_mkwrite_fault(vmf);
3210 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3211 unlock_page(vmf->page);
3212 put_page(vmf->page);
3213 return tmp;
3214 }
3215 } else {
3216 wp_page_reuse(vmf);
3217 lock_page(vmf->page);
3218 }
3219 ret |= fault_dirty_shared_page(vmf);
3220 put_page(vmf->page);
3221
3222 return ret;
3223}
3224
3225
3226/*
3227 * This routine handles present pages, when users try to write to a shared
3228 * page.  It is done by copying the page to a new address and decrementing
3229 * the shared-page counter for the old page.
3230 *
3231 * Note that this routine assumes that the protection checks have been done
3232 * by the caller (the low-level page fault routine in most cases).  Thus we
3233 * can safely just mark it writable once we've done any necessary COW.
3234 *
3235 * We also mark the page dirty at this point even though the page will
3236 * change only once the write actually happens.  This avoids a few races,
3237 * and potentially makes it more efficient.
3238 *
3239 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
3240 * concurrent faults), with pte both mapped and locked.
3241 * We return with pte unmapped and unlocked.
3242 */
3243static vm_fault_t do_wp_page(struct vm_fault *vmf)
3244 __releases(vmf->ptl)
3245{
3246 struct vm_area_struct *vma = vmf->vma;
3247
3248 if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3249 pte_unmap_unlock(vmf->pte, vmf->ptl);
3250 return handle_userfault(vmf, VM_UFFD_WP);
3251 }
3252
3253	/*
3254	 * Userfaultfd write-protect can defer TLB flushes.  Ensure the TLB
3255	 * is flushed in this case before copying.
3256	 */
3257 if (unlikely(userfaultfd_wp(vmf->vma) &&
3258 mm_tlb_flush_pending(vmf->vma->vm_mm)))
3259 flush_tlb_page(vmf->vma, vmf->address);
3260
3261 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3262 if (!vmf->page) {
3263		/*
3264		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
3265		 * VM_PFNMAP VMA.
3266		 *
3267		 * We should not cow pages in a shared writeable mapping.
3268		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
3269		 */
3270 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3271 (VM_WRITE|VM_SHARED))
3272 return wp_pfn_shared(vmf);
3273
3274 pte_unmap_unlock(vmf->pte, vmf->ptl);
3275 return wp_page_copy(vmf);
3276 }
3277
3278
3279
3280
3281
3282 if (PageAnon(vmf->page)) {
3283 struct page *page = vmf->page;
3284
3285
3286 if (PageKsm(page) || page_count(page) != 1)
3287 goto copy;
3288 if (!trylock_page(page))
3289 goto copy;
3290 if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
3291 unlock_page(page);
3292 goto copy;
3293 }
3294
3295
3296
3297
3298
3299 unlock_page(page);
3300 wp_page_reuse(vmf);
3301 return VM_FAULT_WRITE;
3302 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3303 (VM_WRITE|VM_SHARED))) {
3304 return wp_page_shared(vmf);
3305 }
3306copy:
3307
3308
3309
3310 get_page(vmf->page);
3311
3312 pte_unmap_unlock(vmf->pte, vmf->ptl);
3313 return wp_page_copy(vmf);
3314}
3315
3316static void unmap_mapping_range_vma(struct vm_area_struct *vma,
3317 unsigned long start_addr, unsigned long end_addr,
3318 struct zap_details *details)
3319{
3320 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
3321}
3322
3323static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
3324 struct zap_details *details)
3325{
3326 struct vm_area_struct *vma;
3327 pgoff_t vba, vea, zba, zea;
3328
3329 vma_interval_tree_foreach(vma, root,
3330 details->first_index, details->last_index) {
3331
3332 vba = vma->vm_pgoff;
3333 vea = vba + vma_pages(vma) - 1;
3334 zba = details->first_index;
3335 if (zba < vba)
3336 zba = vba;
3337 zea = details->last_index;
3338 if (zea > vea)
3339 zea = vea;
3340
3341 unmap_mapping_range_vma(vma,
3342 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
3343 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
3344 details);
3345 }
3346}
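
/*
 * Worked example for the clipping above (values are hypothetical): a vma
 * with vm_pgoff == 100 covering 50 pages spans file pages 100..149.  Zapping
 * file pages 120..199 clips to zba == 120 and zea == 149, so the range
 * passed down is vm_start + 20 pages up to vm_start + 50 pages.
 */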
3347
3348
3349/**
3350 * unmap_mapping_page() - Unmap single page from processes.
3351 * @page: The locked page to be unmapped.
3352 *
3353 * Unmap this page from any userspace process which still has it mmaped.
3354 * Typically, for efficiency the range of nearby pages has already been
3355 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
3356 * the page is removed from the pagecache, this is used to unmap any
3357 * straggling mapping so that the page cannot be accessed any more.
3358 */
3359void unmap_mapping_page(struct page *page)
3360{
3361 struct address_space *mapping = page->mapping;
3362 struct zap_details details = { };
3363
3364 VM_BUG_ON(!PageLocked(page));
3365 VM_BUG_ON(PageTail(page));
3366
3367 details.check_mapping = mapping;
3368 details.first_index = page->index;
3369 details.last_index = page->index + thp_nr_pages(page) - 1;
3370 details.single_page = page;
3371
3372 i_mmap_lock_write(mapping);
3373 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3374 unmap_mapping_range_tree(&mapping->i_mmap, &details);
3375 i_mmap_unlock_write(mapping);
3376}
3377
3378
3379/**
3380 * unmap_mapping_pages() - Unmap pages from processes.
3381 * @mapping: The address space containing pages to be unmapped.
3382 * @start: Index of first page to be unmapped.
3383 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
3384 * @even_cows: Whether to unmap even private COWed pages.
3385 *
3386 * Unmap the pages in this address space from any userspace process which
3387 * has them mmaped.  Generally, you want to remove COWed pages as well when
3388 * a file is being truncated, but not when invalidating the page cache.
3389 */
3390void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
3391 pgoff_t nr, bool even_cows)
3392{
3393 struct zap_details details = { };
3394
3395 details.check_mapping = even_cows ? NULL : mapping;
3396 details.first_index = start;
3397 details.last_index = start + nr - 1;
3398 if (details.last_index < details.first_index)
3399 details.last_index = ULONG_MAX;
3400
3401 i_mmap_lock_write(mapping);
3402 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
3403 unmap_mapping_range_tree(&mapping->i_mmap, &details);
3404 i_mmap_unlock_write(mapping);
3405}
3406EXPORT_SYMBOL_GPL(unmap_mapping_pages);
3407
3408
3409/**
3410 * unmap_mapping_range - unmap the portion of all mmaps in the specified
3411 * address_space corresponding to the specified byte range in the
3412 * underlying file.
3413 *
3414 * @mapping: the address space containing mmaps to be unmapped.
3415 * @holebegin: byte in first page to unmap, relative to the start of the
3416 * underlying file.  This will be rounded down to a PAGE_SIZE boundary.
3417 * Note that this is different from truncate_pagecache(), which must keep
3418 * the partial page.  In contrast, we must get rid of partial pages.
3419 * @holelen: size of prospective hole in bytes.  This will be rounded up
3420 * to a PAGE_SIZE boundary.  A holelen of zero truncates to the end of
3421 * the file.
3422 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
3423 * but 0 when invalidating pagecache, don't throw away private data.
3424 */
3425void unmap_mapping_range(struct address_space *mapping,
3426 loff_t const holebegin, loff_t const holelen, int even_cows)
3427{
3428 pgoff_t hba = holebegin >> PAGE_SHIFT;
3429 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3430
3431
3432 if (sizeof(holelen) > sizeof(hlen)) {
3433 long long holeend =
3434 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3435 if (holeend & ~(long long)ULONG_MAX)
3436 hlen = ULONG_MAX - hba + 1;
3437 }
3438
3439 unmap_mapping_pages(mapping, hba, hlen, even_cows);
3440}
3441EXPORT_SYMBOL(unmap_mapping_range);
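
/*
 * Example (illustrative sketch): shrinking a file to @newsize would first
 * unmap everything beyond the new end-of-file from all mappings of the
 * inode and then drop the page cache, roughly:
 *
 *	unmap_mapping_range(mapping, round_up(newsize, PAGE_SIZE), 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *
 * A holelen of 0 means "to end of file", and even_cows == 1 also removes
 * private COWed copies, as described above.  Real callers (for example
 * truncate_pagecache()) repeat the unmap after truncation to close races
 * with concurrent faults.
 */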
3442
3443/*
3444 * Restore a potential device exclusive pte to a working pte entry.
3445 */
3446static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
3447{
3448 struct page *page = vmf->page;
3449 struct vm_area_struct *vma = vmf->vma;
3450 struct mmu_notifier_range range;
3451
3452 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
3453 return VM_FAULT_RETRY;
3454 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
3455 vma->vm_mm, vmf->address & PAGE_MASK,
3456 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
3457 mmu_notifier_invalidate_range_start(&range);
3458
3459 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3460 &vmf->ptl);
3461 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3462 restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
3463
3464 pte_unmap_unlock(vmf->pte, vmf->ptl);
3465 unlock_page(page);
3466
3467 mmu_notifier_invalidate_range_end(&range);
3468 return 0;
3469}
3470
3471
3472/*
3473 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
3474 * concurrent faults), and pte mapped but not yet locked.
3475 * We return with pte unmapped and unlocked.
3476 * We return with the mmap_lock locked or unlocked in the same cases as
3477 * does filemap_fault().
3478 */
3479vm_fault_t do_swap_page(struct vm_fault *vmf)
3480{
3481 struct vm_area_struct *vma = vmf->vma;
3482 struct page *page = NULL, *swapcache;
3483 struct swap_info_struct *si = NULL;
3484 swp_entry_t entry;
3485 pte_t pte;
3486 int locked;
3487 int exclusive = 0;
3488 vm_fault_t ret = 0;
3489 void *shadow = NULL;
3490
3491 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
3492 goto out;
3493
3494 entry = pte_to_swp_entry(vmf->orig_pte);
3495 if (unlikely(non_swap_entry(entry))) {
3496 if (is_migration_entry(entry)) {
3497 migration_entry_wait(vma->vm_mm, vmf->pmd,
3498 vmf->address);
3499 } else if (is_device_exclusive_entry(entry)) {
3500 vmf->page = pfn_swap_entry_to_page(entry);
3501 ret = remove_device_exclusive_entry(vmf);
3502 } else if (is_device_private_entry(entry)) {
3503 vmf->page = pfn_swap_entry_to_page(entry);
3504 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3505 } else if (is_hwpoison_entry(entry)) {
3506 ret = VM_FAULT_HWPOISON;
3507 } else {
3508 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
3509 ret = VM_FAULT_SIGBUS;
3510 }
3511 goto out;
3512 }
3513
3514
3515 si = get_swap_device(entry);
3516 if (unlikely(!si))
3517 goto out;
3518
3519 delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
3520 page = lookup_swap_cache(entry, vma, vmf->address);
3521 swapcache = page;
3522
3523 if (!page) {
3524 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
3525 __swap_count(entry) == 1) {
3526
3527 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3528 vmf->address);
3529 if (page) {
3530 __SetPageLocked(page);
3531 __SetPageSwapBacked(page);
3532
3533 if (mem_cgroup_swapin_charge_page(page,
3534 vma->vm_mm, GFP_KERNEL, entry)) {
3535 ret = VM_FAULT_OOM;
3536 goto out_page;
3537 }
3538 mem_cgroup_swapin_uncharge_swap(entry);
3539
3540 shadow = get_shadow_from_swap_cache(entry);
3541 if (shadow)
3542 workingset_refault(page, shadow);
3543
3544 lru_cache_add(page);
3545
3546
3547 set_page_private(page, entry.val);
3548 swap_readpage(page, true);
3549 set_page_private(page, 0);
3550 }
3551 } else {
3552 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3553 vmf);
3554 swapcache = page;
3555 }
3556
3557 if (!page) {
3558
3559
3560
3561
3562 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3563 vmf->address, &vmf->ptl);
3564 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3565 ret = VM_FAULT_OOM;
3566 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3567 goto unlock;
3568 }
3569
3570
3571 ret = VM_FAULT_MAJOR;
3572 count_vm_event(PGMAJFAULT);
3573 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
3574 } else if (PageHWPoison(page)) {
3575
3576
3577
3578
3579 ret = VM_FAULT_HWPOISON;
3580 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3581 goto out_release;
3582 }
3583
3584 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
3585
3586 delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
3587 if (!locked) {
3588 ret |= VM_FAULT_RETRY;
3589 goto out_release;
3590 }
3591
3592
3593
3594
3595
3596
3597
3598 if (unlikely((!PageSwapCache(page) ||
3599 page_private(page) != entry.val)) && swapcache)
3600 goto out_page;
3601
3602 page = ksm_might_need_to_copy(page, vma, vmf->address);
3603 if (unlikely(!page)) {
3604 ret = VM_FAULT_OOM;
3605 page = swapcache;
3606 goto out_page;
3607 }
3608
3609 cgroup_throttle_swaprate(page, GFP_KERNEL);
3610
3611
3612
3613
3614 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3615 &vmf->ptl);
3616 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3617 goto out_nomap;
3618
3619 if (unlikely(!PageUptodate(page))) {
3620 ret = VM_FAULT_SIGBUS;
3621 goto out_nomap;
3622 }
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3635 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3636 pte = mk_pte(page, vma->vm_page_prot);
3637 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3638 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3639 vmf->flags &= ~FAULT_FLAG_WRITE;
3640 ret |= VM_FAULT_WRITE;
3641 exclusive = RMAP_EXCLUSIVE;
3642 }
3643 flush_icache_page(vma, page);
3644 if (pte_swp_soft_dirty(vmf->orig_pte))
3645 pte = pte_mksoft_dirty(pte);
3646 if (pte_swp_uffd_wp(vmf->orig_pte)) {
3647 pte = pte_mkuffd_wp(pte);
3648 pte = pte_wrprotect(pte);
3649 }
3650 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3651 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3652 vmf->orig_pte = pte;
3653
3654
3655 if (unlikely(page != swapcache && swapcache)) {
3656 page_add_new_anon_rmap(page, vma, vmf->address, false);
3657 lru_cache_add_inactive_or_unevictable(page, vma);
3658 } else {
3659 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3660 }
3661
3662 swap_free(entry);
3663 if (mem_cgroup_swap_full(page) ||
3664 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3665 try_to_free_swap(page);
3666 unlock_page(page);
3667 if (page != swapcache && swapcache) {
3668
3669
3670
3671
3672
3673
3674
3675
3676 unlock_page(swapcache);
3677 put_page(swapcache);
3678 }
3679
3680 if (vmf->flags & FAULT_FLAG_WRITE) {
3681 ret |= do_wp_page(vmf);
3682 if (ret & VM_FAULT_ERROR)
3683 ret &= VM_FAULT_ERROR;
3684 goto out;
3685 }
3686
3687
3688 update_mmu_cache(vma, vmf->address, vmf->pte);
3689unlock:
3690 pte_unmap_unlock(vmf->pte, vmf->ptl);
3691out:
3692 if (si)
3693 put_swap_device(si);
3694 return ret;
3695out_nomap:
3696 pte_unmap_unlock(vmf->pte, vmf->ptl);
3697out_page:
3698 unlock_page(page);
3699out_release:
3700 put_page(page);
3701 if (page != swapcache && swapcache) {
3702 unlock_page(swapcache);
3703 put_page(swapcache);
3704 }
3705 if (si)
3706 put_swap_device(si);
3707 return ret;
3708}
3709
3710
3711/*
3712 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
3713 * concurrent faults).  We return with mmap_lock still held.
3714 */
3715static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3716{
3717 struct vm_area_struct *vma = vmf->vma;
3718 struct page *page;
3719 vm_fault_t ret = 0;
3720 pte_t entry;
3721
3722
3723 if (vma->vm_flags & VM_SHARED)
3724 return VM_FAULT_SIGBUS;
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736 if (pte_alloc(vma->vm_mm, vmf->pmd))
3737 return VM_FAULT_OOM;
3738
3739
3740 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3741 return 0;
3742
3743
3744 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3745 !mm_forbids_zeropage(vma->vm_mm)) {
3746 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3747 vma->vm_page_prot));
3748 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3749 vmf->address, &vmf->ptl);
3750 if (!pte_none(*vmf->pte)) {
3751 update_mmu_tlb(vma, vmf->address, vmf->pte);
3752 goto unlock;
3753 }
3754 ret = check_stable_address_space(vma->vm_mm);
3755 if (ret)
3756 goto unlock;
3757
3758 if (userfaultfd_missing(vma)) {
3759 pte_unmap_unlock(vmf->pte, vmf->ptl);
3760 return handle_userfault(vmf, VM_UFFD_MISSING);
3761 }
3762 goto setpte;
3763 }
3764
3765
3766 if (unlikely(anon_vma_prepare(vma)))
3767 goto oom;
3768 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3769 if (!page)
3770 goto oom;
3771
3772 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3773 goto oom_free_page;
3774 cgroup_throttle_swaprate(page, GFP_KERNEL);
3775
3776
3777
3778
3779
3780
3781 __SetPageUptodate(page);
3782
3783 entry = mk_pte(page, vma->vm_page_prot);
3784 entry = pte_sw_mkyoung(entry);
3785 if (vma->vm_flags & VM_WRITE)
3786 entry = pte_mkwrite(pte_mkdirty(entry));
3787
3788 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3789 &vmf->ptl);
3790 if (!pte_none(*vmf->pte)) {
3791 update_mmu_cache(vma, vmf->address, vmf->pte);
3792 goto release;
3793 }
3794
3795 ret = check_stable_address_space(vma->vm_mm);
3796 if (ret)
3797 goto release;
3798
3799
3800 if (userfaultfd_missing(vma)) {
3801 pte_unmap_unlock(vmf->pte, vmf->ptl);
3802 put_page(page);
3803 return handle_userfault(vmf, VM_UFFD_MISSING);
3804 }
3805
3806 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3807 page_add_new_anon_rmap(page, vma, vmf->address, false);
3808 lru_cache_add_inactive_or_unevictable(page, vma);
3809setpte:
3810 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3811
3812
3813 update_mmu_cache(vma, vmf->address, vmf->pte);
3814unlock:
3815 pte_unmap_unlock(vmf->pte, vmf->ptl);
3816 return ret;
3817release:
3818 put_page(page);
3819 goto unlock;
3820oom_free_page:
3821 put_page(page);
3822oom:
3823 return VM_FAULT_OOM;
3824}
3825
3826
3827/*
3828 * The mmap_lock must have been held on entry, and may have been released
3829 * depending on flags and vma->vm_ops->fault() return value.
3830 */
3831static vm_fault_t __do_fault(struct vm_fault *vmf)
3832{
3833 struct vm_area_struct *vma = vmf->vma;
3834 vm_fault_t ret;
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3852 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3853 if (!vmf->prealloc_pte)
3854 return VM_FAULT_OOM;
3855 smp_wmb();
3856 }
3857
3858 ret = vma->vm_ops->fault(vmf);
3859 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3860 VM_FAULT_DONE_COW)))
3861 return ret;
3862
3863 if (unlikely(PageHWPoison(vmf->page))) {
3864 if (ret & VM_FAULT_LOCKED)
3865 unlock_page(vmf->page);
3866 put_page(vmf->page);
3867 vmf->page = NULL;
3868 return VM_FAULT_HWPOISON;
3869 }
3870
3871 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3872 lock_page(vmf->page);
3873 else
3874 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3875
3876 return ret;
3877}
3878
3879#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3880static void deposit_prealloc_pte(struct vm_fault *vmf)
3881{
3882 struct vm_area_struct *vma = vmf->vma;
3883
3884 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3885
3886
3887
3888
3889 mm_inc_nr_ptes(vma->vm_mm);
3890 vmf->prealloc_pte = NULL;
3891}
3892
3893vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3894{
3895 struct vm_area_struct *vma = vmf->vma;
3896 bool write = vmf->flags & FAULT_FLAG_WRITE;
3897 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3898 pmd_t entry;
3899 int i;
3900 vm_fault_t ret = VM_FAULT_FALLBACK;
3901
3902 if (!transhuge_vma_suitable(vma, haddr))
3903 return ret;
3904
3905 page = compound_head(page);
3906 if (compound_order(page) != HPAGE_PMD_ORDER)
3907 return ret;
3908
3909
3910
3911
3912
3913
3914
3915 if (unlikely(PageHasHWPoisoned(page)))
3916 return ret;
3917
3918
3919
3920
3921
3922 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3923 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3924 if (!vmf->prealloc_pte)
3925 return VM_FAULT_OOM;
3926 smp_wmb();
3927 }
3928
3929 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3930 if (unlikely(!pmd_none(*vmf->pmd)))
3931 goto out;
3932
3933 for (i = 0; i < HPAGE_PMD_NR; i++)
3934 flush_icache_page(vma, page + i);
3935
3936 entry = mk_huge_pmd(page, vma->vm_page_prot);
3937 if (write)
3938 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3939
3940 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3941 page_add_file_rmap(page, true);
3942
3943
3944
3945 if (arch_needs_pgtable_deposit())
3946 deposit_prealloc_pte(vmf);
3947
3948 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3949
3950 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3951
3952
3953 ret = 0;
3954 count_vm_event(THP_FILE_MAPPED);
3955out:
3956 spin_unlock(vmf->ptl);
3957 return ret;
3958}
3959#else
3960vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3961{
3962 return VM_FAULT_FALLBACK;
3963}
3964#endif
3965
3966void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
3967{
3968 struct vm_area_struct *vma = vmf->vma;
3969 bool write = vmf->flags & FAULT_FLAG_WRITE;
3970 bool prefault = vmf->address != addr;
3971 pte_t entry;
3972
3973 flush_icache_page(vma, page);
3974 entry = mk_pte(page, vma->vm_page_prot);
3975
3976 if (prefault && arch_wants_old_prefaulted_pte())
3977 entry = pte_mkold(entry);
3978 else
3979 entry = pte_sw_mkyoung(entry);
3980
3981 if (write)
3982 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3983
3984 if (write && !(vma->vm_flags & VM_SHARED)) {
3985 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3986 page_add_new_anon_rmap(page, vma, addr, false);
3987 lru_cache_add_inactive_or_unevictable(page, vma);
3988 } else {
3989 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3990 page_add_file_rmap(page, false);
3991 }
3992 set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
3993}
3994
3995
3996/**
3997 * finish_fault - finish page fault once we have prepared the page to fault
3998 * @vmf: structure describing the fault
3999 *
4000 * This function handles all that is needed to finish a page fault once the
4001 * page to fault in is prepared.  It handles locking of PTEs, inserts the
4002 * PTE for the given page, adds reverse page mapping and handles LRU
4003 * addition (for anonymous COW pages).
4004 *
4005 * The function expects the page to be locked and on success it consumes a
4006 * reference of a page being mapped (for the PTE which maps it).
4007 *
4008 * Return: %0 on success, %VM_FAULT_ code in case of error.
4009 */
4010vm_fault_t finish_fault(struct vm_fault *vmf)
4011{
4012 struct vm_area_struct *vma = vmf->vma;
4013 struct page *page;
4014 vm_fault_t ret;
4015
4016
4017 if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
4018 page = vmf->cow_page;
4019 else
4020 page = vmf->page;
4021
4022
4023
4024
4025
4026 if (!(vma->vm_flags & VM_SHARED)) {
4027 ret = check_stable_address_space(vma->vm_mm);
4028 if (ret)
4029 return ret;
4030 }
4031
4032 if (pmd_none(*vmf->pmd)) {
4033 if (PageTransCompound(page)) {
4034 ret = do_set_pmd(vmf, page);
4035 if (ret != VM_FAULT_FALLBACK)
4036 return ret;
4037 }
4038
4039 if (vmf->prealloc_pte) {
4040 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
4041 if (likely(pmd_none(*vmf->pmd))) {
4042 mm_inc_nr_ptes(vma->vm_mm);
4043 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
4044 vmf->prealloc_pte = NULL;
4045 }
4046 spin_unlock(vmf->ptl);
4047 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
4048 return VM_FAULT_OOM;
4049 }
4050 }
4051
4052
4053 if (pmd_devmap_trans_unstable(vmf->pmd))
4054 return 0;
4055
4056 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4057 vmf->address, &vmf->ptl);
4058 ret = 0;
4059
4060 if (likely(pte_none(*vmf->pte)))
4061 do_set_pte(vmf, page, vmf->address);
4062 else
4063 ret = VM_FAULT_NOPAGE;
4064
4065 update_mmu_tlb(vma, vmf->address, vmf->pte);
4066 pte_unmap_unlock(vmf->pte, vmf->ptl);
4067 return ret;
4068}
4069
4070static unsigned long fault_around_bytes __read_mostly =
4071 rounddown_pow_of_two(65536);
4072
4073#ifdef CONFIG_DEBUG_FS
4074static int fault_around_bytes_get(void *data, u64 *val)
4075{
4076 *val = fault_around_bytes;
4077 return 0;
4078}
4079
4080
4081
4082
4083
4084static int fault_around_bytes_set(void *data, u64 val)
4085{
4086 if (val / PAGE_SIZE > PTRS_PER_PTE)
4087 return -EINVAL;
4088 if (val > PAGE_SIZE)
4089 fault_around_bytes = rounddown_pow_of_two(val);
4090 else
4091 fault_around_bytes = PAGE_SIZE;
4092 return 0;
4093}
4094DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
4095 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
4096
4097static int __init fault_around_debugfs(void)
4098{
4099 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
4100 &fault_around_bytes_fops);
4101 return 0;
4102}
4103late_initcall(fault_around_debugfs);
4104#endif
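
/*
 * The knob created above can be inspected and tuned at run time, assuming
 * debugfs is mounted in the usual place, e.g.:
 *
 *	# cat /sys/kernel/debug/fault_around_bytes
 *	65536
 *	# echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * Values are rounded down to a power of two, and anything at or below one
 * page disables fault-around (do_read_fault() only calls do_fault_around()
 * when fault_around_bytes covers more than a single page).
 */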
4105
4106
4107/*
4108 * do_fault_around() tries to map few pages around the fault address.  The
4109 * hope is that the pages will be needed soon and this will lower the number
4110 * of faults to handle.
4111 *
4112 * It uses vm_ops->map_pages() to map the pages, which skips the page if
4113 * it's not ready to be mapped: not up-to-date, locked, etc.
4114 *
4115 * This function doesn't cross the VMA boundaries, in order to call
4116 * map_pages() only once.
4117 *
4118 * fault_around_bytes defines how many bytes we'll try to map.
4119 * do_fault_around() expects it to be set to a power of two less than or
4120 * equal to PTRS_PER_PTE worth of bytes.
4121 *
4122 * The virtual address of the area that we map is naturally aligned to
4123 * fault_around_bytes rounded down to the machine page size (and therefore
4124 * to page order).  This way it's easier to guarantee that we don't cross
4125 * page table boundaries.
4126 *
4127 * After mapping, the function returns whatever vm_ops->map_pages() returned,
4128 * or VM_FAULT_OOM if the page table for the range could not be allocated.
4129 */
4130static vm_fault_t do_fault_around(struct vm_fault *vmf)
4131{
4132 unsigned long address = vmf->address, nr_pages, mask;
4133 pgoff_t start_pgoff = vmf->pgoff;
4134 pgoff_t end_pgoff;
4135 int off;
4136
4137 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
4138 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
4139
4140 address = max(address & mask, vmf->vma->vm_start);
4141 off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
4142 start_pgoff -= off;
4143
4144
4145
4146
4147
4148 end_pgoff = start_pgoff -
4149 ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
4150 PTRS_PER_PTE - 1;
4151 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
4152 start_pgoff + nr_pages - 1);
4153
4154 if (pmd_none(*vmf->pmd)) {
4155 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
4156 if (!vmf->prealloc_pte)
4157 return VM_FAULT_OOM;
4158 smp_wmb();
4159 }
4160
4161 return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
4162}
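
/*
 * Worked example for the window computed above, with the default
 * fault_around_bytes of 65536 and 4K pages (both assumptions): nr_pages is
 * 16 and mask clears the low 16 address bits, so a fault at 0x7f1234567890
 * maps the 16-page block starting at 0x7f1234560000, clamped to the vma and
 * to the page table containing the faulting pte.
 */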
4163
4164static vm_fault_t do_read_fault(struct vm_fault *vmf)
4165{
4166 struct vm_area_struct *vma = vmf->vma;
4167 vm_fault_t ret = 0;
4168
4169
4170
4171
4172
4173
4174 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
4175 if (likely(!userfaultfd_minor(vmf->vma))) {
4176 ret = do_fault_around(vmf);
4177 if (ret)
4178 return ret;
4179 }
4180 }
4181
4182 ret = __do_fault(vmf);
4183 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4184 return ret;
4185
4186 ret |= finish_fault(vmf);
4187 unlock_page(vmf->page);
4188 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4189 put_page(vmf->page);
4190 return ret;
4191}
4192
4193static vm_fault_t do_cow_fault(struct vm_fault *vmf)
4194{
4195 struct vm_area_struct *vma = vmf->vma;
4196 vm_fault_t ret;
4197
4198 if (unlikely(anon_vma_prepare(vma)))
4199 return VM_FAULT_OOM;
4200
4201 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
4202 if (!vmf->cow_page)
4203 return VM_FAULT_OOM;
4204
4205 if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
4206 put_page(vmf->cow_page);
4207 return VM_FAULT_OOM;
4208 }
4209 cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
4210
4211 ret = __do_fault(vmf);
4212 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4213 goto uncharge_out;
4214 if (ret & VM_FAULT_DONE_COW)
4215 return ret;
4216
4217 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
4218 __SetPageUptodate(vmf->cow_page);
4219
4220 ret |= finish_fault(vmf);
4221 unlock_page(vmf->page);
4222 put_page(vmf->page);
4223 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4224 goto uncharge_out;
4225 return ret;
4226uncharge_out:
4227 put_page(vmf->cow_page);
4228 return ret;
4229}
4230
4231static vm_fault_t do_shared_fault(struct vm_fault *vmf)
4232{
4233 struct vm_area_struct *vma = vmf->vma;
4234 vm_fault_t ret, tmp;
4235
4236 ret = __do_fault(vmf);
4237 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
4238 return ret;
4239
4240
4241
4242
4243
4244 if (vma->vm_ops->page_mkwrite) {
4245 unlock_page(vmf->page);
4246 tmp = do_page_mkwrite(vmf);
4247 if (unlikely(!tmp ||
4248 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
4249 put_page(vmf->page);
4250 return tmp;
4251 }
4252 }
4253
4254 ret |= finish_fault(vmf);
4255 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
4256 VM_FAULT_RETRY))) {
4257 unlock_page(vmf->page);
4258 put_page(vmf->page);
4259 return ret;
4260 }
4261
4262 ret |= fault_dirty_shared_page(vmf);
4263 return ret;
4264}
4265
4266
4267/*
4268 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
4269 * concurrent faults).  The mmap_lock may have been released depending on
4270 * flags and our return value; see filemap_fault().  If mmap_lock is
4271 * released, the vma may become invalid (for example by another thread
4272 * calling munmap()).
4273 */
4274static vm_fault_t do_fault(struct vm_fault *vmf)
4275{
4276 struct vm_area_struct *vma = vmf->vma;
4277 struct mm_struct *vm_mm = vma->vm_mm;
4278 vm_fault_t ret;
4279
4280
4281
4282
4283 if (!vma->vm_ops->fault) {
4284
4285
4286
4287
4288 if (unlikely(!pmd_present(*vmf->pmd)))
4289 ret = VM_FAULT_SIGBUS;
4290 else {
4291 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
4292 vmf->pmd,
4293 vmf->address,
4294 &vmf->ptl);
4295
4296
4297
4298
4299
4300
4301
4302 if (unlikely(pte_none(*vmf->pte)))
4303 ret = VM_FAULT_SIGBUS;
4304 else
4305 ret = VM_FAULT_NOPAGE;
4306
4307 pte_unmap_unlock(vmf->pte, vmf->ptl);
4308 }
4309 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
4310 ret = do_read_fault(vmf);
4311 else if (!(vma->vm_flags & VM_SHARED))
4312 ret = do_cow_fault(vmf);
4313 else
4314 ret = do_shared_fault(vmf);
4315
4316
4317 if (vmf->prealloc_pte) {
4318 pte_free(vm_mm, vmf->prealloc_pte);
4319 vmf->prealloc_pte = NULL;
4320 }
4321 return ret;
4322}
4323
4324int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
4325 unsigned long addr, int page_nid, int *flags)
4326{
4327 get_page(page);
4328
4329 count_vm_numa_event(NUMA_HINT_FAULTS);
4330 if (page_nid == numa_node_id()) {
4331 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
4332 *flags |= TNF_FAULT_LOCAL;
4333 }
4334
4335 return mpol_misplaced(page, vma, addr);
4336}
4337
4338static vm_fault_t do_numa_page(struct vm_fault *vmf)
4339{
4340 struct vm_area_struct *vma = vmf->vma;
4341 struct page *page = NULL;
4342 int page_nid = NUMA_NO_NODE;
4343 int last_cpupid;
4344 int target_nid;
4345 pte_t pte, old_pte;
4346 bool was_writable = pte_savedwrite(vmf->orig_pte);
4347 int flags = 0;
4348
4349
4350
4351
4352
4353
4354 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4355 spin_lock(vmf->ptl);
4356 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4357 pte_unmap_unlock(vmf->pte, vmf->ptl);
4358 goto out;
4359 }
4360
4361
4362 old_pte = ptep_get(vmf->pte);
4363 pte = pte_modify(old_pte, vma->vm_page_prot);
4364
4365 page = vm_normal_page(vma, vmf->address, pte);
4366 if (!page)
4367 goto out_map;
4368
4369
4370 if (PageCompound(page))
4371 goto out_map;
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381 if (!was_writable)
4382 flags |= TNF_NO_GROUP;
4383
4384
4385
4386
4387
4388 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4389 flags |= TNF_SHARED;
4390
4391 last_cpupid = page_cpupid_last(page);
4392 page_nid = page_to_nid(page);
4393 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4394 &flags);
4395 if (target_nid == NUMA_NO_NODE) {
4396 put_page(page);
4397 goto out_map;
4398 }
4399 pte_unmap_unlock(vmf->pte, vmf->ptl);
4400
4401
4402 if (migrate_misplaced_page(page, vma, target_nid)) {
4403 page_nid = target_nid;
4404 flags |= TNF_MIGRATED;
4405 } else {
4406 flags |= TNF_MIGRATE_FAIL;
4407 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4408 spin_lock(vmf->ptl);
4409 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4410 pte_unmap_unlock(vmf->pte, vmf->ptl);
4411 goto out;
4412 }
4413 goto out_map;
4414 }
4415
4416out:
4417 if (page_nid != NUMA_NO_NODE)
4418 task_numa_fault(last_cpupid, page_nid, 1, flags);
4419 return 0;
4420out_map:
4421
4422
4423
4424
4425 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4426 pte = pte_modify(old_pte, vma->vm_page_prot);
4427 pte = pte_mkyoung(pte);
4428 if (was_writable)
4429 pte = pte_mkwrite(pte);
4430 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4431 update_mmu_cache(vma, vmf->address, vmf->pte);
4432 pte_unmap_unlock(vmf->pte, vmf->ptl);
4433 goto out;
4434}
4435
4436static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
4437{
4438 if (vma_is_anonymous(vmf->vma))
4439 return do_huge_pmd_anonymous_page(vmf);
4440 if (vmf->vma->vm_ops->huge_fault)
4441 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4442 return VM_FAULT_FALLBACK;
4443}
4444
4445
4446static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
4447{
4448 if (vma_is_anonymous(vmf->vma)) {
4449 if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
4450 return handle_userfault(vmf, VM_UFFD_WP);
4451 return do_huge_pmd_wp_page(vmf);
4452 }
4453 if (vmf->vma->vm_ops->huge_fault) {
4454 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4455
4456 if (!(ret & VM_FAULT_FALLBACK))
4457 return ret;
4458 }
4459
4460
4461 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4462
4463 return VM_FAULT_FALLBACK;
4464}
4465
4466static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4467{
4468#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4469 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4470
4471 if (vma_is_anonymous(vmf->vma))
4472 goto split;
4473 if (vmf->vma->vm_ops->huge_fault) {
4474 vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4475
4476 if (!(ret & VM_FAULT_FALLBACK))
4477 return ret;
4478 }
4479split:
4480
4481 __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4482#endif
4483 return VM_FAULT_FALLBACK;
4484}
4485
4486static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4487{
4488#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4489
4490 if (vma_is_anonymous(vmf->vma))
4491 return VM_FAULT_FALLBACK;
4492 if (vmf->vma->vm_ops->huge_fault)
4493 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4494#endif
4495 return VM_FAULT_FALLBACK;
4496}
4497
4498
4499/*
4500 * These routines also need to handle stuff like marking pages dirty
4501 * and/or accessed for architectures that don't do it in hardware (most
4502 * RISC architectures).  The early dirtying is also good on the i386.
4503 *
4504 * There is also a hook called "update_mmu_cache()" that architectures
4505 * with external mmu caches can use to update those (ie the Sparc or
4506 * PowerPC hashed page tables that act as extended TLBs).
4507 *
4508 * We enter with non-exclusive mmap_lock (to exclude vma changes, but
4509 * allow concurrent faults).  The mmap_lock may have been released
4510 * depending on flags and our return value; see filemap_fault() and
4511 * __lock_page_or_retry().
4512 */
4513static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4514{
4515 pte_t entry;
4516
4517 if (unlikely(pmd_none(*vmf->pmd))) {
4518
4519
4520
4521
4522
4523
4524 vmf->pte = NULL;
4525 } else {
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538 if (pmd_devmap_trans_unstable(vmf->pmd))
4539 return 0;
4540
4541
4542
4543
4544
4545
4546 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4547 vmf->orig_pte = *vmf->pte;
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557 barrier();
4558 if (pte_none(vmf->orig_pte)) {
4559 pte_unmap(vmf->pte);
4560 vmf->pte = NULL;
4561 }
4562 }
4563
4564 if (!vmf->pte) {
4565 if (vma_is_anonymous(vmf->vma))
4566 return do_anonymous_page(vmf);
4567 else
4568 return do_fault(vmf);
4569 }
4570
4571 if (!pte_present(vmf->orig_pte))
4572 return do_swap_page(vmf);
4573
4574 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4575 return do_numa_page(vmf);
4576
4577 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4578 spin_lock(vmf->ptl);
4579 entry = vmf->orig_pte;
4580 if (unlikely(!pte_same(*vmf->pte, entry))) {
4581 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4582 goto unlock;
4583 }
4584 if (vmf->flags & FAULT_FLAG_WRITE) {
4585 if (!pte_write(entry))
4586 return do_wp_page(vmf);
4587 entry = pte_mkdirty(entry);
4588 }
4589 entry = pte_mkyoung(entry);
4590 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
4591 vmf->flags & FAULT_FLAG_WRITE)) {
4592 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4593 } else {
4594
4595 if (vmf->flags & FAULT_FLAG_TRIED)
4596 goto unlock;
4597
4598
4599
4600
4601
4602
4603 if (vmf->flags & FAULT_FLAG_WRITE)
4604 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4605 }
4606unlock:
4607 pte_unmap_unlock(vmf->pte, vmf->ptl);
4608 return 0;
4609}
4610
4611
4612/*
4613 * By the time we get here, we already hold the mm semaphore.  The
4614 * mmap_lock may have been released depending on flags and our return
4615 * value; see filemap_fault() and __lock_page_or_retry().
4616 */
4617static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
4618 unsigned long address, unsigned int flags)
4619{
4620 struct vm_fault vmf = {
4621 .vma = vma,
4622 .address = address & PAGE_MASK,
4623 .flags = flags,
4624 .pgoff = linear_page_index(vma, address),
4625 .gfp_mask = __get_fault_gfp_mask(vma),
4626 };
4627 unsigned int dirty = flags & FAULT_FLAG_WRITE;
4628 struct mm_struct *mm = vma->vm_mm;
4629 pgd_t *pgd;
4630 p4d_t *p4d;
4631 vm_fault_t ret;
4632
4633 pgd = pgd_offset(mm, address);
4634 p4d = p4d_alloc(mm, pgd, address);
4635 if (!p4d)
4636 return VM_FAULT_OOM;
4637
4638 vmf.pud = pud_alloc(mm, p4d, address);
4639 if (!vmf.pud)
4640 return VM_FAULT_OOM;
4641retry_pud:
4642 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4643 ret = create_huge_pud(&vmf);
4644 if (!(ret & VM_FAULT_FALLBACK))
4645 return ret;
4646 } else {
4647 pud_t orig_pud = *vmf.pud;
4648
4649 barrier();
4650 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4651
4652
4653
4654 if (dirty && !pud_write(orig_pud)) {
4655 ret = wp_huge_pud(&vmf, orig_pud);
4656 if (!(ret & VM_FAULT_FALLBACK))
4657 return ret;
4658 } else {
4659 huge_pud_set_accessed(&vmf, orig_pud);
4660 return 0;
4661 }
4662 }
4663 }
4664
4665 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4666 if (!vmf.pmd)
4667 return VM_FAULT_OOM;
4668
4669
4670 if (pud_trans_unstable(vmf.pud))
4671 goto retry_pud;
4672
4673 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4674 ret = create_huge_pmd(&vmf);
4675 if (!(ret & VM_FAULT_FALLBACK))
4676 return ret;
4677 } else {
4678 vmf.orig_pmd = *vmf.pmd;
4679
4680 barrier();
4681 if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
4682 VM_BUG_ON(thp_migration_supported() &&
4683 !is_pmd_migration_entry(vmf.orig_pmd));
4684 if (is_pmd_migration_entry(vmf.orig_pmd))
4685 pmd_migration_entry_wait(mm, vmf.pmd);
4686 return 0;
4687 }
4688 if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
4689 if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
4690 return do_huge_pmd_numa_page(&vmf);
4691
4692 if (dirty && !pmd_write(vmf.orig_pmd)) {
4693 ret = wp_huge_pmd(&vmf);
4694 if (!(ret & VM_FAULT_FALLBACK))
4695 return ret;
4696 } else {
4697 huge_pmd_set_accessed(&vmf);
4698 return 0;
4699 }
4700 }
4701 }
4702
4703 return handle_pte_fault(&vmf);
4704}
4705
4706
4707/**
4708 * mm_account_fault - Do page fault accounting
4709 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
4710 *        of perf event counters, but we'll still do the per-task accounting
4711 *        to the task who triggered this page fault.
4712 * @address: the faulted address.
4713 * @flags: the fault flags.
4714 * @ret: the fault retcode.
4715 *
4716 * This will take care of most of the page fault accounting.  Meanwhile, it
4717 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
4718 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS
4719 * should still be in per-arch page fault handlers at the entry of fault.
4720 */
4721static inline void mm_account_fault(struct pt_regs *regs,
4722 unsigned long address, unsigned int flags,
4723 vm_fault_t ret)
4724{
4725 bool major;
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738 if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
4739 return;
4740
4741
4742
4743
4744
4745
4746 major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
4747
4748 if (major)
4749 current->maj_flt++;
4750 else
4751 current->min_flt++;
4752
4753
4754
4755
4756
4757
4758 if (!regs)
4759 return;
4760
4761 if (major)
4762 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4763 else
4764 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
4765}
4766
4767
4768/*
4769 * The main entry point for faulting in user pages: by the time we get
4770 * here the caller already holds the mm semaphore.  The mmap_lock may have
4771 * been released on return, depending on flags and our return value.
4772 */
4773vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4774 unsigned int flags, struct pt_regs *regs)
4775{
4776 vm_fault_t ret;
4777
4778 __set_current_state(TASK_RUNNING);
4779
4780 count_vm_event(PGFAULT);
4781 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4782
4783
4784 check_sync_rss_stat(current);
4785
4786 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4787 flags & FAULT_FLAG_INSTRUCTION,
4788 flags & FAULT_FLAG_REMOTE))
4789 return VM_FAULT_SIGSEGV;
4790
4791
4792
4793
4794
4795 if (flags & FAULT_FLAG_USER)
4796 mem_cgroup_enter_user_fault();
4797
4798 if (unlikely(is_vm_hugetlb_page(vma)))
4799 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4800 else
4801 ret = __handle_mm_fault(vma, address, flags);
4802
4803 if (flags & FAULT_FLAG_USER) {
4804 mem_cgroup_exit_user_fault();
4805
4806
4807
4808
4809
4810
4811 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4812 mem_cgroup_oom_synchronize(false);
4813 }
4814
4815 mm_account_fault(regs, address, flags, ret);
4816
4817 return ret;
4818}
4819EXPORT_SYMBOL_GPL(handle_mm_fault);
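
/*
 * Example (illustrative sketch): architecture fault handlers look up the
 * vma under mmap_lock and call in here.  Heavily simplified, with signal,
 * retry and error handling omitted; the flags below are assumptions of the
 * sketch, not a requirement:
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, address);
 *	if (!vma || vma->vm_start > address)
 *		goto bad_area;		(expand_stack() handling omitted)
 *	fault = handle_mm_fault(vma, address,
 *				FAULT_FLAG_DEFAULT | FAULT_FLAG_USER, regs);
 *	if (!(fault & VM_FAULT_RETRY))
 *		mmap_read_unlock(mm);
 *
 * With FAULT_FLAG_ALLOW_RETRY (part of FAULT_FLAG_DEFAULT), a VM_FAULT_RETRY
 * return means the fault path already dropped mmap_lock, which is why the
 * unlock above is conditional.  See arch/<arch>/mm/fault.c for real callers.
 */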
4820
4821#ifndef __PAGETABLE_P4D_FOLDED
4822
4823
4824
4825
4826int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4827{
4828 p4d_t *new = p4d_alloc_one(mm, address);
4829 if (!new)
4830 return -ENOMEM;
4831
4832 smp_wmb();
4833
4834 spin_lock(&mm->page_table_lock);
4835 if (pgd_present(*pgd))
4836 p4d_free(mm, new);
4837 else
4838 pgd_populate(mm, pgd, new);
4839 spin_unlock(&mm->page_table_lock);
4840 return 0;
4841}
4842#endif
4843
4844#ifndef __PAGETABLE_PUD_FOLDED
4845
4846
4847
4848
4849int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4850{
4851 pud_t *new = pud_alloc_one(mm, address);
4852 if (!new)
4853 return -ENOMEM;
4854
4855 smp_wmb();
4856
4857 spin_lock(&mm->page_table_lock);
4858 if (!p4d_present(*p4d)) {
4859 mm_inc_nr_puds(mm);
4860 p4d_populate(mm, p4d, new);
4861 } else
4862 pud_free(mm, new);
4863 spin_unlock(&mm->page_table_lock);
4864 return 0;
4865}
4866#endif
4867
4868#ifndef __PAGETABLE_PMD_FOLDED
4869
4870
4871
4872
4873int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4874{
4875 spinlock_t *ptl;
4876 pmd_t *new = pmd_alloc_one(mm, address);
4877 if (!new)
4878 return -ENOMEM;
4879
4880 smp_wmb();
4881
4882 ptl = pud_lock(mm, pud);
4883 if (!pud_present(*pud)) {
4884 mm_inc_nr_pmds(mm);
4885 pud_populate(mm, pud, new);
4886 } else
4887 pmd_free(mm, new);
4888 spin_unlock(ptl);
4889 return 0;
4890}
4891#endif
4892
4893int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
4894 struct mmu_notifier_range *range, pte_t **ptepp,
4895 pmd_t **pmdpp, spinlock_t **ptlp)
4896{
4897 pgd_t *pgd;
4898 p4d_t *p4d;
4899 pud_t *pud;
4900 pmd_t *pmd;
4901 pte_t *ptep;
4902
4903 pgd = pgd_offset(mm, address);
4904 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4905 goto out;
4906
4907 p4d = p4d_offset(pgd, address);
4908 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4909 goto out;
4910
4911 pud = pud_offset(p4d, address);
4912 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4913 goto out;
4914
4915 pmd = pmd_offset(pud, address);
4916 VM_BUG_ON(pmd_trans_huge(*pmd));
4917
4918 if (pmd_huge(*pmd)) {
4919 if (!pmdpp)
4920 goto out;
4921
4922 if (range) {
4923 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4924 NULL, mm, address & PMD_MASK,
4925 (address & PMD_MASK) + PMD_SIZE);
4926 mmu_notifier_invalidate_range_start(range);
4927 }
4928 *ptlp = pmd_lock(mm, pmd);
4929 if (pmd_huge(*pmd)) {
4930 *pmdpp = pmd;
4931 return 0;
4932 }
4933 spin_unlock(*ptlp);
4934 if (range)
4935 mmu_notifier_invalidate_range_end(range);
4936 }
4937
4938 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4939 goto out;
4940
4941 if (range) {
4942 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4943 address & PAGE_MASK,
4944 (address & PAGE_MASK) + PAGE_SIZE);
4945 mmu_notifier_invalidate_range_start(range);
4946 }
4947 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4948 if (!pte_present(*ptep))
4949 goto unlock;
4950 *ptepp = ptep;
4951 return 0;
4952unlock:
4953 pte_unmap_unlock(ptep, *ptlp);
4954 if (range)
4955 mmu_notifier_invalidate_range_end(range);
4956out:
4957 return -EINVAL;
4958}
4959
4960
4961/**
4962 * follow_pte - look up PTE at a user virtual address
4963 * @mm: the mm_struct of the target address space
4964 * @address: user virtual address
4965 * @ptepp: location to store found PTE
4966 * @ptlp: location to store the lock for the PTE
4967 *
4968 * On a successful return, the pointer to the PTE is stored in @ptepp;
4969 * the corresponding lock is taken and its location is stored in @ptlp.
4970 * The contents of the PTE are only stable until @ptlp is released;
4971 * any further use, if any, must be protected against invalidation
4972 * with MMU notifiers.
4973 *
4974 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
4975 * should be taken for read.
4976 *
4977 * KVM uses this function.  While it is arguably less bad than follow_pfn(),
4978 * it is not a good general-purpose API.
4979 * Return: zero on success, -ve otherwise.
4980 */
4981int follow_pte(struct mm_struct *mm, unsigned long address,
4982 pte_t **ptepp, spinlock_t **ptlp)
4983{
4984 return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
4985}
4986EXPORT_SYMBOL_GPL(follow_pte);
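
/*
 * Example (illustrative sketch) of the locking discipline described above:
 * the returned pte pointer is only valid until the lock is dropped, so
 * callers read what they need and unlock immediately, e.g.:
 *
 *	pte_t *ptep;
 *	spinlock_t *ptl;
 *
 *	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
 *		return -EINVAL;
 *	pfn = pte_pfn(*ptep);
 *	pte_unmap_unlock(ptep, ptl);
 *
 * follow_pfn() and follow_phys() below are in-tree users of exactly this
 * pattern.
 */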
4987
4988
4989/**
4990 * follow_pfn - look up PFN at a user virtual address
4991 * @vma: memory mapping
4992 * @address: user virtual address
4993 * @pfn: location to store found PFN
4994 *
4995 * Only IO mappings and raw PFN mappings are allowed.
4996 *
4997 * This function does not allow the caller to read the permissions of
4998 * the PTE, and the pfn may be stale by the time it is used.
4999 * Return: zero and the pfn at @pfn on success, -ve otherwise.
5000 */
5001int follow_pfn(struct vm_area_struct *vma, unsigned long address,
5002 unsigned long *pfn)
5003{
5004 int ret = -EINVAL;
5005 spinlock_t *ptl;
5006 pte_t *ptep;
5007
5008 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5009 return ret;
5010
5011 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
5012 if (ret)
5013 return ret;
5014 *pfn = pte_pfn(*ptep);
5015 pte_unmap_unlock(ptep, ptl);
5016 return 0;
5017}
5018EXPORT_SYMBOL(follow_pfn);
5019
5020#ifdef CONFIG_HAVE_IOREMAP_PROT
5021int follow_phys(struct vm_area_struct *vma,
5022 unsigned long address, unsigned int flags,
5023 unsigned long *prot, resource_size_t *phys)
5024{
5025 int ret = -EINVAL;
5026 pte_t *ptep, pte;
5027 spinlock_t *ptl;
5028
5029 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5030 goto out;
5031
5032 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
5033 goto out;
5034 pte = *ptep;
5035
5036 if ((flags & FOLL_WRITE) && !pte_write(pte))
5037 goto unlock;
5038
5039 *prot = pgprot_val(pte_pgprot(pte));
5040 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5041
5042 ret = 0;
5043unlock:
5044 pte_unmap_unlock(ptep, ptl);
5045out:
5046 return ret;
5047}
5048
5049
5050/**
5051 * generic_access_phys - generic implementation for iomem mmap access
5052 * @vma: the vma to access
5053 * @addr: userspace address, not relative offset within @vma
5054 * @buf: buffer to read/write
5055 * @len: length of transfer
5056 * @write: set to FOLL_WRITE when writing, otherwise reading
5057 *
5058 * This is a generic implementation of &vm_operations_struct.access for an
5059 * iomem mapping, used by access_process_vm() when the vma is not page based.
5060 */
5061int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
5062 void *buf, int len, int write)
5063{
5064 resource_size_t phys_addr;
5065 unsigned long prot = 0;
5066 void __iomem *maddr;
5067 pte_t *ptep, pte;
5068 spinlock_t *ptl;
5069 int offset = offset_in_page(addr);
5070 int ret = -EINVAL;
5071
5072 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
5073 return -EINVAL;
5074
5075retry:
5076 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5077 return -EINVAL;
5078 pte = *ptep;
5079 pte_unmap_unlock(ptep, ptl);
5080
5081 prot = pgprot_val(pte_pgprot(pte));
5082 phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
5083
5084 if ((write & FOLL_WRITE) && !pte_write(pte))
5085 return -EINVAL;
5086
5087 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
5088 if (!maddr)
5089 return -ENOMEM;
5090
5091 if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
5092 goto out_unmap;
5093
5094 if (!pte_same(pte, *ptep)) {
5095 pte_unmap_unlock(ptep, ptl);
5096 iounmap(maddr);
5097
5098 goto retry;
5099 }
5100
5101 if (write)
5102 memcpy_toio(maddr + offset, buf, len);
5103 else
5104 memcpy_fromio(buf, maddr + offset, len);
5105 ret = len;
5106 pte_unmap_unlock(ptep, ptl);
5107out_unmap:
5108 iounmap(maddr);
5109
5110 return ret;
5111}
5112EXPORT_SYMBOL_GPL(generic_access_phys);
5113#endif
5114
5115
5116
5117
5118int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
5119 int len, unsigned int gup_flags)
5120{
5121 struct vm_area_struct *vma;
5122 void *old_buf = buf;
5123 int write = gup_flags & FOLL_WRITE;
5124
5125 if (mmap_read_lock_killable(mm))
5126 return 0;
5127
5128
5129 while (len) {
5130 int bytes, ret, offset;
5131 void *maddr;
5132 struct page *page = NULL;
5133
5134 ret = get_user_pages_remote(mm, addr, 1,
5135 gup_flags, &page, &vma, NULL);
5136 if (ret <= 0) {
5137#ifndef CONFIG_HAVE_IOREMAP_PROT
5138 break;
5139#else
5140
5141
5142
5143
5144 vma = vma_lookup(mm, addr);
5145 if (!vma)
5146 break;
5147 if (vma->vm_ops && vma->vm_ops->access)
5148 ret = vma->vm_ops->access(vma, addr, buf,
5149 len, write);
5150 if (ret <= 0)
5151 break;
5152 bytes = ret;
5153#endif
5154 } else {
5155 bytes = len;
5156 offset = addr & (PAGE_SIZE-1);
5157 if (bytes > PAGE_SIZE-offset)
5158 bytes = PAGE_SIZE-offset;
5159
5160 maddr = kmap(page);
5161 if (write) {
5162 copy_to_user_page(vma, page, addr,
5163 maddr + offset, buf, bytes);
5164 set_page_dirty_lock(page);
5165 } else {
5166 copy_from_user_page(vma, page, addr,
5167 buf, maddr + offset, bytes);
5168 }
5169 kunmap(page);
5170 put_page(page);
5171 }
5172 len -= bytes;
5173 buf += bytes;
5174 addr += bytes;
5175 }
5176 mmap_read_unlock(mm);
5177
5178 return buf - old_buf;
5179}
5180
5181
5182/**
5183 * access_remote_vm - access another process' address space
5184 * @mm:		the mm_struct of the target address space
5185 * @addr:	start address to access
5186 * @buf:	source or destination buffer
5187 * @len:	number of bytes to transfer
5188 * @gup_flags:	flags modifying lookup behaviour
5189 *
5190 * The caller must hold a reference on @mm.
5191 * Return: number of bytes copied from source to destination.
5192 */
5193int access_remote_vm(struct mm_struct *mm, unsigned long addr,
5194 void *buf, int len, unsigned int gup_flags)
5195{
5196 return __access_remote_vm(mm, addr, buf, len, gup_flags);
5197}
5198
5199
5200/*
5201 * Access another process' address space.  The source/target buffer must
5202 * be in kernel space; do not walk the page tables directly.
5203 */
5204int access_process_vm(struct task_struct *tsk, unsigned long addr,
5205 void *buf, int len, unsigned int gup_flags)
5206{
5207 struct mm_struct *mm;
5208 int ret;
5209
5210 mm = get_task_mm(tsk);
5211 if (!mm)
5212 return 0;
5213
5214 ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
5215
5216 mmput(mm);
5217
5218 return ret;
5219}
5220EXPORT_SYMBOL_GPL(access_process_vm);
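
/*
 * Example (illustrative sketch): reading one word from another task's
 * address space, ptrace-style.  FOLL_FORCE is what debuggers typically pass
 * so that read-only mappings can still be inspected; whether it is
 * appropriate, and the required permission checks, are up to the caller:
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word),
 *			      FOLL_FORCE) != sizeof(word))
 *		return -EIO;
 */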
5221
5222/*
5223 * Print the name of a VMA, for use in debug and crash reports.
5224 */
5225void print_vma_addr(char *prefix, unsigned long ip)
5226{
5227 struct mm_struct *mm = current->mm;
5228 struct vm_area_struct *vma;
5229
5230
5231
5232
5233 if (!mmap_read_trylock(mm))
5234 return;
5235
5236 vma = find_vma(mm, ip);
5237 if (vma && vma->vm_file) {
5238 struct file *f = vma->vm_file;
5239 char *buf = (char *)__get_free_page(GFP_NOWAIT);
5240 if (buf) {
5241 char *p;
5242
5243 p = file_path(f, buf, PAGE_SIZE);
5244 if (IS_ERR(p))
5245 p = "?";
5246 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
5247 vma->vm_start,
5248 vma->vm_end - vma->vm_start);
5249 free_page((unsigned long)buf);
5250 }
5251 }
5252 mmap_read_unlock(mm);
5253}
5254
5255#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5256void __might_fault(const char *file, int line)
5257{
5258
5259
5260
5261
5262
5263
5264 if (uaccess_kernel())
5265 return;
5266 if (pagefault_disabled())
5267 return;
5268 __might_sleep(file, line, 0);
5269#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
5270 if (current->mm)
5271		might_lock_read(&current->mm->mmap_lock);
5272#endif
5273}
5274EXPORT_SYMBOL(__might_fault);
5275#endif
5276
5277#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
5278/*
5279 * Process all subpages of the specified huge page with the specified
5280 * operation.  The target subpage will be processed last to keep its
5281 * cache lines hot.
5282 */
5283static inline void process_huge_page(
5284 unsigned long addr_hint, unsigned int pages_per_huge_page,
5285 void (*process_subpage)(unsigned long addr, int idx, void *arg),
5286 void *arg)
5287{
5288 int i, n, base, l;
5289 unsigned long addr = addr_hint &
5290 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5291
5292
5293 might_sleep();
5294 n = (addr_hint - addr) / PAGE_SIZE;
5295 if (2 * n <= pages_per_huge_page) {
5296
5297 base = 0;
5298 l = n;
5299
5300 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
5301 cond_resched();
5302 process_subpage(addr + i * PAGE_SIZE, i, arg);
5303 }
5304 } else {
5305
5306 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
5307 l = pages_per_huge_page - n;
5308
5309 for (i = 0; i < base; i++) {
5310 cond_resched();
5311 process_subpage(addr + i * PAGE_SIZE, i, arg);
5312 }
5313 }
5314
5315
5316
5317
5318 for (i = 0; i < l; i++) {
5319 int left_idx = base + i;
5320 int right_idx = base + 2 * l - 1 - i;
5321
5322 cond_resched();
5323 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
5324 cond_resched();
5325 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
5326 }
5327}
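
/*
 * Worked example for the ordering above (assuming a 512-subpage huge page):
 * if the faulting address hits subpage 500, then 2 * n > pages_per_huge_page,
 * so subpages 0..487 are processed first in ascending order, and the last 24
 * subpages are then processed from both ends converging on the target,
 * finishing with subpage 500 so that its cache lines stay hot.
 */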
5328
5329static void clear_gigantic_page(struct page *page,
5330 unsigned long addr,
5331 unsigned int pages_per_huge_page)
5332{
5333 int i;
5334 struct page *p = page;
5335
5336 might_sleep();
5337 for (i = 0; i < pages_per_huge_page;
5338 i++, p = mem_map_next(p, page, i)) {
5339 cond_resched();
5340 clear_user_highpage(p, addr + i * PAGE_SIZE);
5341 }
5342}
5343
5344static void clear_subpage(unsigned long addr, int idx, void *arg)
5345{
5346 struct page *page = arg;
5347
5348 clear_user_highpage(page + idx, addr);
5349}
5350
5351void clear_huge_page(struct page *page,
5352 unsigned long addr_hint, unsigned int pages_per_huge_page)
5353{
5354 unsigned long addr = addr_hint &
5355 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5356
5357 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5358 clear_gigantic_page(page, addr, pages_per_huge_page);
5359 return;
5360 }
5361
5362 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
5363}
5364
5365static void copy_user_gigantic_page(struct page *dst, struct page *src,
5366 unsigned long addr,
5367 struct vm_area_struct *vma,
5368 unsigned int pages_per_huge_page)
5369{
5370 int i;
5371 struct page *dst_base = dst;
5372 struct page *src_base = src;
5373
5374 for (i = 0; i < pages_per_huge_page; ) {
5375 cond_resched();
5376 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
5377
5378 i++;
5379 dst = mem_map_next(dst, dst_base, i);
5380 src = mem_map_next(src, src_base, i);
5381 }
5382}
5383
5384struct copy_subpage_arg {
5385 struct page *dst;
5386 struct page *src;
5387 struct vm_area_struct *vma;
5388};
5389
5390static void copy_subpage(unsigned long addr, int idx, void *arg)
5391{
5392 struct copy_subpage_arg *copy_arg = arg;
5393
5394 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
5395 addr, copy_arg->vma);
5396}
5397
5398void copy_user_huge_page(struct page *dst, struct page *src,
5399 unsigned long addr_hint, struct vm_area_struct *vma,
5400 unsigned int pages_per_huge_page)
5401{
5402 unsigned long addr = addr_hint &
5403 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
5404 struct copy_subpage_arg arg = {
5405 .dst = dst,
5406 .src = src,
5407 .vma = vma,
5408 };
5409
5410 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
5411 copy_user_gigantic_page(dst, src, addr, vma,
5412 pages_per_huge_page);
5413 return;
5414 }
5415
5416 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
5417}
5418
5419long copy_huge_page_from_user(struct page *dst_page,
5420 const void __user *usr_src,
5421 unsigned int pages_per_huge_page,
5422 bool allow_pagefault)
5423{
5424 void *src = (void *)usr_src;
5425 void *page_kaddr;
5426 unsigned long i, rc = 0;
5427 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
5428 struct page *subpage = dst_page;
5429
5430 for (i = 0; i < pages_per_huge_page;
5431 i++, subpage = mem_map_next(subpage, dst_page, i)) {
5432 if (allow_pagefault)
5433 page_kaddr = kmap(subpage);
5434 else
5435 page_kaddr = kmap_atomic(subpage);
5436 rc = copy_from_user(page_kaddr,
5437 (const void __user *)(src + i * PAGE_SIZE),
5438 PAGE_SIZE);
5439 if (allow_pagefault)
5440 kunmap(subpage);
5441 else
5442 kunmap_atomic(page_kaddr);
5443
5444 ret_val -= (PAGE_SIZE - rc);
5445 if (rc)
5446 break;
5447
5448 cond_resched();
5449 }
5450 return ret_val;
5451}
5452#endif
5453
5454#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
5455
5456static struct kmem_cache *page_ptl_cachep;
5457
5458void __init ptlock_cache_init(void)
5459{
5460 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
5461 SLAB_PANIC, NULL);
5462}
5463
5464bool ptlock_alloc(struct page *page)
5465{
5466 spinlock_t *ptl;
5467
5468 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
5469 if (!ptl)
5470 return false;
5471 page->ptl = ptl;
5472 return true;
5473}
5474
5475void ptlock_free(struct page *page)
5476{
5477 kmem_cache_free(page_ptl_cachep, page->ptl);
5478}
5479#endif
5480