/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory marks the upper bound of the kernel's directly mapped
 * (low) memory; addresses above it belong to highmem/vmalloc space.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);


#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

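/*
 * When the faulting task owns @mm, batch RSS updates in task_struct and
 * only fold them into the mm's atomic counters once in a while (see
 * check_sync_rss_stat() below); otherwise fall back to the atomic
 * update directly.
 */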
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

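/*
 * The free_{pmd,pud,p4d}_range() helpers below each walk one level of
 * the page-table tree, freeing the now-empty tables beneath.  @floor
 * and @ceiling bound how far an upper-level table may be torn down, so
 * a table still shared with an adjacent region is left in place.
 */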
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb();

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated
 * with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a struct
 * page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 */
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte, bool with_public_device)
{
	unsigned long pfn = pte_pfn(pte);

	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;

		/*
		 * Device public pages are special pages (they are ZONE_DEVICE
		 * pages but different from persistent memory). They behave
		 * almost like normal pages. The difference is that they are
		 * not on the lru and thus should never be involved with
		 * anything that involves lru manipulation (mlock, numa
		 * balancing, ...).
		 *
		 * This is why we still want to return NULL for such pages
		 * from vm_normal_page() unless the caller asked for them
		 * explicitly via @with_public_device.
		 */
		if (likely(pfn <= highest_memmap_pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (is_device_public_page(page)) {
				if (with_public_device)
					return page;
				return NULL;
			}
		}

		if (pte_devmap(pte))
			return NULL;

		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (pmd_devmap(pmd))
		return NULL;
	if (is_zero_pfn(pfn))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			rss[mm_counter(page)]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		} else if (is_device_private_entry(entry)) {
			page = device_private_entry_to_page(entry);

			/*
			 * Update rss count even for unaddressable pages, as
			 * they should be considered just like normal pages in
			 * this respect.
			 *
			 * We will likely want to have some new rss counters
			 * for unaddressable pages, at some point. But for now
			 * keep things as they are.
			 */
			get_page(page);
			rss[mm_counter(page)]++;
			page_dup_rmap(page, false);

			/*
			 * We do not preserve soft-dirty information, because so
			 * far, checkpoint/restore is the only feature that
			 * requires that. And checkpoint/restore does not work
			 * when a device driver is involved (you cannot easily
			 * save and restore device driver state).
			 */
			if (is_write_device_private_entry(entry) &&
			    is_cow_mapping(vm_flags)) {
				make_device_private_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	} else if (pte_devmap(pte)) {
		page = pte_page(pte);

		/*
		 * Device public pages are refcounted and behave much like
		 * normal pages, so take a reference when copying them;
		 * other devmap pages are not refcounted here.
		 */
		if (is_device_public_page(page)) {
			get_page(page);
			page_dup_rmap(page, false);
			rss[mm_counter(page)]++;
		}
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

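/*
 * Copy one pte table of @vma from @src_mm to @dst_mm.  Both pte locks
 * are held across the loop, so progress is bounded (roughly 32 entries
 * at a time) before checking for resched/lock contention; a failed
 * swap_duplicate() breaks out so a swap-count continuation can be
 * allocated without the locks held.
 */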
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, vma, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(&range);
	return ret;
}

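/*
 * Tear down one pte table's worth of mappings, batching the page frees
 * through @tlb (mmu_gather).  Dirty and referenced bits are propagated
 * to the struct page before the pte is cleared; if the gather batch
 * fills up, or a dirty shared page forces an early TLB flush, the scan
 * restarts at the address where it stopped.
 */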
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_change_page_size(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (pte_present(ptent)) {
			struct page *page;

			page = _vm_normal_page(vma, addr, ptent, true);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (non_swap_entry(entry) && is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is none
		 * or trans huge it can change under us. This is because
		 * MADV_DONTNEED holds the mmap_sem in read mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				start_addr, end_addr);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, start + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, range.end, NULL);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, start, range.end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	lru_add_drain();
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				address, address + size);
	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
	update_hiwater_rss(vma->vm_mm);
	mmu_notifier_invalidate_range_start(&range);
	unmap_single_vma(&tlb, vma, address, range.end, details);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    !(vma->vm_flags & VM_PFNMAP))
		return;

	zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

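/*
 * Walk to the pte covering @addr, allocating any missing intermediate
 * page-table levels on the way, and return the pte mapped and locked.
 * Returns NULL if an allocation fails.  Callers unmap/unlock with
 * pte_unmap_unlock().
 */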
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset > num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - maps range of kernel pages starts with non zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

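/*
 * Insert a pte for @pfn at @addr, either as a special or a devmap
 * entry.  With @mkwrite, an already-present pte for the same pfn is
 * upgraded to young/dirty/writable instead of being replaced.
 */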
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
			if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
				goto out_unlock;
			}
			entry = pte_mkyoung(*pte);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn, bool mkwrite)
{
	pgprot_t pgprot = vma->vm_page_prot;
	int err;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_devmap() being true.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		err = insert_page(vma, addr, page, pgprot);
	} else {
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	}

	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  If the insertion of PTE failed because someone else already added a
 *  different entry in the mean time, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;
	int err = 0;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return err;
}

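/*
 * The remap_{pmd,pud,p4d}_range() helpers below walk one level each.
 * Note the "pfn -= addr >> PAGE_SHIFT" bias: adding back
 * "addr >> PAGE_SHIFT" at each lower level then yields the pfn that
 * corresponds to the virtual address currently being mapped.
 */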
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

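/*
 * Leaf helper for apply_to_page_range(): map (and, for user mms, lock)
 * the pte table and invoke @fn on every pte in [addr, end).  Kernel
 * mappings go through pte_alloc_kernel() and need no pte lock.
 */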
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);
	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically. Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

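/*
 * Fill a freshly allocated COW page with the contents of the page it is
 * about to replace.
 */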
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}

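/*
 * GFP mask for allocations made while handling a fault in @vma: derive
 * it from the backing file's mapping when there is one; fileless
 * special mappings just get GFP_KERNEL.
 */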
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static void fault_dirty_shared_page(struct vm_area_struct *vma,
				    struct page *page)
{
	struct address_space *mapping;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().  The address_space itself remains
	 * pinned by vma->vm_file's reference.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if ((dirtied || page_mkwrite) && mapping) {
		/*
		 * Some device drivers do not set page.mapping
		 * but still dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping stuff for the old page.
 * - If the new page is written to, set it dirty.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mem_cgroup *memcg;
	struct mmu_notifier_range range;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, vmf->address, vma);
	}

	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
		goto oom_free_new;

	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		mem_cgroup_commit_charge(new_page, memcg, false, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg, false);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}

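/*
 * Handle a write fault on a shared file-backed page: call the
 * filesystem's ->page_mkwrite() when one is provided, re-validate the
 * pte via finish_mkwrite_fault(), then dirty the page and throttle
 * against writeback.
 */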
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	fault_dirty_shared_page(vma, vmf->page);
	put_page(vmf->page);

	return VM_FAULT_WRITE;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(vmf->page)) {
		int total_map_swapcount;
		if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
					   page_count(vmf->page) != 1))
			goto copy;
		if (!trylock_page(vmf->page)) {
			get_page(vmf->page);
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			lock_page(vmf->page);
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (!pte_same(*vmf->pte, vmf->orig_pte)) {
				unlock_page(vmf->page);
				pte_unmap_unlock(vmf->pte, vmf->ptl);
				put_page(vmf->page);
				return 0;
			}
			put_page(vmf->page);
		}
		if (PageKsm(vmf->page)) {
			bool reused = reuse_ksm_page(vmf->page, vmf->vma,
						     vmf->address);
			unlock_page(vmf->page);
			if (!reused)
				goto copy;
			wp_page_reuse(vmf);
			return VM_FAULT_WRITE;
		}
		if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
			if (total_map_swapcount == 1) {
				/*
				 * The page is all ours. Move it to
				 * our anon_vma so the rmap code will
				 * not search our parent or siblings.
				 * Protected against the rmap code by
				 * the page lock.
				 */
				page_move_anon_rmap(vmf->page, vma);
			}
			unlock_page(vmf->page);
			wp_page_reuse(vmf);
			return VM_FAULT_WRITE;
		}
		unlock_page(vmf->page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}
copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}

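/*
 * Helpers for unmap_mapping_pages() below: clip the requested zap range
 * to each vma found in the mapping's interval tree, then zap it.
 */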
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };

	details.check_mapping = even_cows ? NULL : mapping;
	details.first_index = start;
	details.last_index = start + nr - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739void unmap_mapping_range(struct address_space *mapping,
2740 loff_t const holebegin, loff_t const holelen, int even_cows)
2741{
2742 pgoff_t hba = holebegin >> PAGE_SHIFT;
2743 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2744
	/* Check for overflow. */
2746 if (sizeof(holelen) > sizeof(hlen)) {
2747 long long holeend =
2748 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2749 if (holeend & ~(long long)ULONG_MAX)
2750 hlen = ULONG_MAX - hba + 1;
2751 }
2752
2753 unmap_mapping_pages(mapping, hba, hlen, even_cows);
2754}
2755EXPORT_SYMBOL(unmap_mapping_range);
2756
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
2765vm_fault_t do_swap_page(struct vm_fault *vmf)
2766{
2767 struct vm_area_struct *vma = vmf->vma;
2768 struct page *page = NULL, *swapcache;
2769 struct mem_cgroup *memcg;
2770 swp_entry_t entry;
2771 pte_t pte;
2772 int locked;
2773 int exclusive = 0;
2774 vm_fault_t ret = 0;
2775
2776 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2777 goto out;
2778
2779 entry = pte_to_swp_entry(vmf->orig_pte);
2780 if (unlikely(non_swap_entry(entry))) {
2781 if (is_migration_entry(entry)) {
2782 migration_entry_wait(vma->vm_mm, vmf->pmd,
2783 vmf->address);
2784 } else if (is_device_private_entry(entry)) {
			/*
			 * For un-addressable device memory we call the
			 * pgmap fault handler callback, which must migrate
			 * the page back to some CPU-accessible page.
			 */
2790 ret = device_private_entry_fault(vma, vmf->address, entry,
2791 vmf->flags, vmf->pmd);
2792 } else if (is_hwpoison_entry(entry)) {
2793 ret = VM_FAULT_HWPOISON;
2794 } else {
2795 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2796 ret = VM_FAULT_SIGBUS;
2797 }
2798 goto out;
2799 }
2800
2801
2802 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2803 page = lookup_swap_cache(entry, vma, vmf->address);
2804 swapcache = page;
2805
2806 if (!page) {
2807 struct swap_info_struct *si = swp_swap_info(entry);
2808
2809 if (si->flags & SWP_SYNCHRONOUS_IO &&
2810 __swap_count(si, entry) == 1) {
2811
2812 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2813 vmf->address);
2814 if (page) {
2815 __SetPageLocked(page);
2816 __SetPageSwapBacked(page);
2817 set_page_private(page, entry.val);
2818 lru_cache_add_anon(page);
2819 swap_readpage(page, true);
2820 }
2821 } else {
2822 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2823 vmf);
2824 swapcache = page;
2825 }
2826
2827 if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
2832 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2833 vmf->address, &vmf->ptl);
2834 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2835 ret = VM_FAULT_OOM;
2836 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2837 goto unlock;
2838 }
2839
2840
2841 ret = VM_FAULT_MAJOR;
2842 count_vm_event(PGMAJFAULT);
2843 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2844 } else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * own processes.
		 */
2849 ret = VM_FAULT_HWPOISON;
2850 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2851 goto out_release;
2852 }
2853
2854 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2855
2856 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2857 if (!locked) {
2858 ret |= VM_FAULT_RETRY;
2859 goto out_release;
2860 }
2861
	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
2868 if (unlikely((!PageSwapCache(page) ||
2869 page_private(page) != entry.val)) && swapcache)
2870 goto out_page;
2871
2872 page = ksm_might_need_to_copy(page, vma, vmf->address);
2873 if (unlikely(!page)) {
2874 ret = VM_FAULT_OOM;
2875 page = swapcache;
2876 goto out_page;
2877 }
2878
2879 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
2880 &memcg, false)) {
2881 ret = VM_FAULT_OOM;
2882 goto out_page;
2883 }
2884
	/*
	 * Back out if somebody else already faulted in this pte.
	 */
2888 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2889 &vmf->ptl);
2890 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2891 goto out_nomap;
2892
2893 if (unlikely(!PageUptodate(page))) {
2894 ret = VM_FAULT_SIGBUS;
2895 goto out_nomap;
2896 }
2897
	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */
2908 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2909 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2910 pte = mk_pte(page, vma->vm_page_prot);
2911 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2912 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2913 vmf->flags &= ~FAULT_FLAG_WRITE;
2914 ret |= VM_FAULT_WRITE;
2915 exclusive = RMAP_EXCLUSIVE;
2916 }
2917 flush_icache_page(vma, page);
2918 if (pte_swp_soft_dirty(vmf->orig_pte))
2919 pte = pte_mksoft_dirty(pte);
2920 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2921 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
2922 vmf->orig_pte = pte;
2923
2924
2925 if (unlikely(page != swapcache && swapcache)) {
2926 page_add_new_anon_rmap(page, vma, vmf->address, false);
2927 mem_cgroup_commit_charge(page, memcg, false, false);
2928 lru_cache_add_active_or_unevictable(page, vma);
2929 } else {
2930 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2931 mem_cgroup_commit_charge(page, memcg, true, false);
2932 activate_page(page);
2933 }
2934
2935 swap_free(entry);
2936 if (mem_cgroup_swap_full(page) ||
2937 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2938 try_to_free_swap(page);
2939 unlock_page(page);
2940 if (page != swapcache && swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same).  For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
2949 unlock_page(swapcache);
2950 put_page(swapcache);
2951 }
2952
2953 if (vmf->flags & FAULT_FLAG_WRITE) {
2954 ret |= do_wp_page(vmf);
2955 if (ret & VM_FAULT_ERROR)
2956 ret &= VM_FAULT_ERROR;
2957 goto out;
2958 }
2959
2960
2961 update_mmu_cache(vma, vmf->address, vmf->pte);
2962unlock:
2963 pte_unmap_unlock(vmf->pte, vmf->ptl);
2964out:
2965 return ret;
2966out_nomap:
2967 mem_cgroup_cancel_charge(page, memcg, false);
2968 pte_unmap_unlock(vmf->pte, vmf->ptl);
2969out_page:
2970 unlock_page(page);
2971out_release:
2972 put_page(page);
2973 if (page != swapcache && swapcache) {
2974 unlock_page(swapcache);
2975 put_page(swapcache);
2976 }
2977 return ret;
2978}
2979
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2985static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
2986{
2987 struct vm_area_struct *vma = vmf->vma;
2988 struct mem_cgroup *memcg;
2989 struct page *page;
2990 vm_fault_t ret = 0;
2991 pte_t entry;
2992
2993
2994 if (vma->vm_flags & VM_SHARED)
2995 return VM_FAULT_SIGBUS;
2996
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under down_write(&mm->mmap_sem)
	 * or when parallel threads are excluded by other means.
	 *
	 * Here we only have down_read(mmap_sem).
	 */
3007 if (pte_alloc(vma->vm_mm, vmf->pmd))
3008 return VM_FAULT_OOM;
3009
3010
3011 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3012 return 0;
3013
3014
3015 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3016 !mm_forbids_zeropage(vma->vm_mm)) {
3017 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3018 vma->vm_page_prot));
3019 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3020 vmf->address, &vmf->ptl);
3021 if (!pte_none(*vmf->pte))
3022 goto unlock;
3023 ret = check_stable_address_space(vma->vm_mm);
3024 if (ret)
3025 goto unlock;
3026
3027 if (userfaultfd_missing(vma)) {
3028 pte_unmap_unlock(vmf->pte, vmf->ptl);
3029 return handle_userfault(vmf, VM_UFFD_MISSING);
3030 }
3031 goto setpte;
3032 }
3033
3034
3035 if (unlikely(anon_vma_prepare(vma)))
3036 goto oom;
3037 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3038 if (!page)
3039 goto oom;
3040
3041 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
3042 false))
3043 goto oom_free_page;
3044
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
3050 __SetPageUptodate(page);
3051
3052 entry = mk_pte(page, vma->vm_page_prot);
3053 if (vma->vm_flags & VM_WRITE)
3054 entry = pte_mkwrite(pte_mkdirty(entry));
3055
3056 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3057 &vmf->ptl);
3058 if (!pte_none(*vmf->pte))
3059 goto release;
3060
3061 ret = check_stable_address_space(vma->vm_mm);
3062 if (ret)
3063 goto release;
3064
3065
3066 if (userfaultfd_missing(vma)) {
3067 pte_unmap_unlock(vmf->pte, vmf->ptl);
3068 mem_cgroup_cancel_charge(page, memcg, false);
3069 put_page(page);
3070 return handle_userfault(vmf, VM_UFFD_MISSING);
3071 }
3072
3073 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3074 page_add_new_anon_rmap(page, vma, vmf->address, false);
3075 mem_cgroup_commit_charge(page, memcg, false, false);
3076 lru_cache_add_active_or_unevictable(page, vma);
3077setpte:
3078 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3079
3080
3081 update_mmu_cache(vma, vmf->address, vmf->pte);
3082unlock:
3083 pte_unmap_unlock(vmf->pte, vmf->ptl);
3084 return ret;
3085release:
3086 mem_cgroup_cancel_charge(page, memcg, false);
3087 put_page(page);
3088 goto unlock;
3089oom_free_page:
3090 put_page(page);
3091oom:
3092 return VM_FAULT_OOM;
3093}
3094
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3100static vm_fault_t __do_fault(struct vm_fault *vmf)
3101{
3102 struct vm_area_struct *vma = vmf->vma;
3103 vm_fault_t ret;
3104
	/*
	 * Preallocate the pte table before taking the page lock in
	 * ->fault().  Allocating it later, with a page already locked,
	 * can deadlock against memcg reclaim: reclaim may wait for
	 * writeback on a page whose writeback completion in turn needs
	 * the page lock we are holding.
	 */
3120 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3121 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3122 if (!vmf->prealloc_pte)
3123 return VM_FAULT_OOM;
3124 smp_wmb();
3125 }
3126
3127 ret = vma->vm_ops->fault(vmf);
3128 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3129 VM_FAULT_DONE_COW)))
3130 return ret;
3131
3132 if (unlikely(PageHWPoison(vmf->page))) {
3133 if (ret & VM_FAULT_LOCKED)
3134 unlock_page(vmf->page);
3135 put_page(vmf->page);
3136 vmf->page = NULL;
3137 return VM_FAULT_HWPOISON;
3138 }
3139
3140 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3141 lock_page(vmf->page);
3142 else
3143 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3144
3145 return ret;
3146}
3147
/*
 * The ordering of these checks matters for pmds with _PAGE_DEVMAP set:
 * a devmap pmd must be recognised before pmd_trans_unstable(), which
 * would otherwise treat it as a bad pmd and clear it.
 */
3154static int pmd_devmap_trans_unstable(pmd_t *pmd)
3155{
3156 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3157}
3158
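/*
 * Map the pte level for vmf->address, allocating (or consuming the
 * preallocated) page table if the pmd is still empty, and take the pte
 * lock.  Returns VM_FAULT_NOPAGE if a huge or devmap pmd appeared under
 * us, VM_FAULT_OOM if the table could not be allocated, and 0 otherwise
 * with vmf->pte and vmf->ptl set.
 */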
3159static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3160{
3161 struct vm_area_struct *vma = vmf->vma;
3162
3163 if (!pmd_none(*vmf->pmd))
3164 goto map_pte;
3165 if (vmf->prealloc_pte) {
3166 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3167 if (unlikely(!pmd_none(*vmf->pmd))) {
3168 spin_unlock(vmf->ptl);
3169 goto map_pte;
3170 }
3171
3172 mm_inc_nr_ptes(vma->vm_mm);
3173 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3174 spin_unlock(vmf->ptl);
3175 vmf->prealloc_pte = NULL;
3176 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3177 return VM_FAULT_OOM;
3178 }
3179map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  All we
	 * have to ensure is that it is a regular pmd that we can walk
	 * with pte_offset_map(); pmd_devmap_trans_unstable() checks that
	 * with an atomic read, covering both the devmap case and a pmd
	 * that became (or is becoming) transparent huge.
	 */
3191 if (pmd_devmap_trans_unstable(vmf->pmd))
3192 return VM_FAULT_NOPAGE;
3193
	/*
	 * A regular pmd is established and it can't morph into a huge
	 * pmd from under us anymore at this point because we hold the
	 * mmap_sem read mode and khugepaged takes it in write mode.
	 * So now it's safe to run pte_offset_map().
	 */
3203 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3204 &vmf->ptl);
3205 return 0;
3206}
3207
3208#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3209
3210#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3211static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3212 unsigned long haddr)
3213{
3214 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3215 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3216 return false;
3217 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3218 return false;
3219 return true;
3220}
3221
3222static void deposit_prealloc_pte(struct vm_fault *vmf)
3223{
3224 struct vm_area_struct *vma = vmf->vma;
3225
3226 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
3231 mm_inc_nr_ptes(vma->vm_mm);
3232 vmf->prealloc_pte = NULL;
3233}
3234
3235static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3236{
3237 struct vm_area_struct *vma = vmf->vma;
3238 bool write = vmf->flags & FAULT_FLAG_WRITE;
3239 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3240 pmd_t entry;
3241 int i;
3242 vm_fault_t ret;
3243
3244 if (!transhuge_vma_suitable(vma, haddr))
3245 return VM_FAULT_FALLBACK;
3246
3247 ret = VM_FAULT_FALLBACK;
3248 page = compound_head(page);
	/*
	 * Some architectures need a deposited page table to store extra
	 * per-pte information (see arch_needs_pgtable_deposit()); make
	 * sure we have one preallocated before taking the pmd lock.
	 */
3254 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3255 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3256 if (!vmf->prealloc_pte)
3257 return VM_FAULT_OOM;
3258 smp_wmb();
3259 }
3260
3261 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3262 if (unlikely(!pmd_none(*vmf->pmd)))
3263 goto out;
3264
3265 for (i = 0; i < HPAGE_PMD_NR; i++)
3266 flush_icache_page(vma, page + i);
3267
3268 entry = mk_huge_pmd(page, vma->vm_page_prot);
3269 if (write)
3270 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3271
3272 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3273 page_add_file_rmap(page, true);
3274
3275
3276
3277 if (arch_needs_pgtable_deposit())
3278 deposit_prealloc_pte(vmf);
3279
3280 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3281
3282 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3283
3284
3285 ret = 0;
3286 count_vm_event(THP_FILE_MAPPED);
3287out:
3288 spin_unlock(vmf->ptl);
3289 return ret;
3290}
3291#else
3292static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3293{
3294 BUILD_BUG();
3295 return 0;
3296}
3297#endif
3298
/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping.  If needed, the function allocates a page table or uses the
 * one preallocated in @vmf.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL
 * on return.
 *
 * Target users are the page fault handler itself and implementations of
 * vm_ops->map_pages.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3315vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3316 struct page *page)
3317{
3318 struct vm_area_struct *vma = vmf->vma;
3319 bool write = vmf->flags & FAULT_FLAG_WRITE;
3320 pte_t entry;
3321 vm_fault_t ret;
3322
3323 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3324 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3325
3326 VM_BUG_ON_PAGE(memcg, page);
3327
3328 ret = do_set_pmd(vmf, page);
3329 if (ret != VM_FAULT_FALLBACK)
3330 return ret;
3331 }
3332
3333 if (!vmf->pte) {
3334 ret = pte_alloc_one_map(vmf);
3335 if (ret)
3336 return ret;
3337 }
3338
3339
3340 if (unlikely(!pte_none(*vmf->pte)))
3341 return VM_FAULT_NOPAGE;
3342
3343 flush_icache_page(vma, page);
3344 entry = mk_pte(page, vma->vm_page_prot);
3345 if (write)
3346 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3347
3348 if (write && !(vma->vm_flags & VM_SHARED)) {
3349 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3350 page_add_new_anon_rmap(page, vma, vmf->address, false);
3351 mem_cgroup_commit_charge(page, memcg, false, false);
3352 lru_cache_add_active_or_unevictable(page, vma);
3353 } else {
3354 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3355 page_add_file_rmap(page, false);
3356 }
3357 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3358
3359
3360 update_mmu_cache(vma, vmf->address, vmf->pte);
3361
3362 return 0;
3363}
3364
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared.  It handles locking of PTEs, inserts the
 * PTE for the given page, adds the reverse page mapping, and handles memcg
 * charging and LRU addition.
 *
 * The function expects the page to be locked and, on success, it consumes
 * a reference of the page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3381vm_fault_t finish_fault(struct vm_fault *vmf)
3382{
3383 struct page *page;
3384 vm_fault_t ret = 0;
3385
3386
3387 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3388 !(vmf->vma->vm_flags & VM_SHARED))
3389 page = vmf->cow_page;
3390 else
3391 page = vmf->page;
3392
	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
3397 if (!(vmf->vma->vm_flags & VM_SHARED))
3398 ret = check_stable_address_space(vmf->vma->vm_mm);
3399 if (!ret)
3400 ret = alloc_set_pte(vmf, vmf->memcg, page);
3401 if (vmf->pte)
3402 pte_unmap_unlock(vmf->pte, vmf->ptl);
3403 return ret;
3404}
3405
3406static unsigned long fault_around_bytes __read_mostly =
3407 rounddown_pow_of_two(65536);
3408
3409#ifdef CONFIG_DEBUG_FS
3410static int fault_around_bytes_get(void *data, u64 *val)
3411{
3412 *val = fault_around_bytes;
3413 return 0;
3414}
3415
/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
3420static int fault_around_bytes_set(void *data, u64 val)
3421{
3422 if (val / PAGE_SIZE > PTRS_PER_PTE)
3423 return -EINVAL;
3424 if (val > PAGE_SIZE)
3425 fault_around_bytes = rounddown_pow_of_two(val);
3426 else
3427 fault_around_bytes = PAGE_SIZE;
3428 return 0;
3429}
3430DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3431 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3432
3433static int __init fault_around_debugfs(void)
3434{
3435 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3436 &fault_around_bytes_fops);
3437 return 0;
3438}
3439late_initcall(fault_around_debugfs);
3440#endif
3441
/*
 * do_fault_around() tries to map a few pages around the fault address.
 * The hope is that those pages will be needed soon and this will lower the
 * number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips a page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken.  In the split
 * ptlock case the page table lock only protects those entries which belong
 * to the page table corresponding to the fault address.
 *
 * This function doesn't cross VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or
 * equal to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size (and therefore
 * to page order).  This way it's easier to guarantee that we don't cross
 * page table boundaries.
 */
3466static vm_fault_t do_fault_around(struct vm_fault *vmf)
3467{
3468 unsigned long address = vmf->address, nr_pages, mask;
3469 pgoff_t start_pgoff = vmf->pgoff;
3470 pgoff_t end_pgoff;
3471 int off;
3472 vm_fault_t ret = 0;
3473
3474 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3475 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3476
3477 vmf->address = max(address & mask, vmf->vma->vm_start);
3478 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3479 start_pgoff -= off;
3480
	/*
	 * end_pgoff is either the end of the page table, the end of
	 * the vma or nr_pages from start_pgoff, depending what is nearest.
	 */
3485 end_pgoff = start_pgoff -
3486 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3487 PTRS_PER_PTE - 1;
3488 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3489 start_pgoff + nr_pages - 1);
3490
3491 if (pmd_none(*vmf->pmd)) {
3492 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3493 if (!vmf->prealloc_pte)
3494 goto out;
3495 smp_wmb();
3496 }
3497
3498 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3499
3500
3501 if (pmd_trans_huge(*vmf->pmd)) {
3502 ret = VM_FAULT_NOPAGE;
3503 goto out;
3504 }
3505
3506
3507 if (!vmf->pte)
3508 goto out;
3509
3510
3511 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3512 if (!pte_none(*vmf->pte))
3513 ret = VM_FAULT_NOPAGE;
3514 pte_unmap_unlock(vmf->pte, vmf->ptl);
3515out:
3516 vmf->address = address;
3517 vmf->pte = NULL;
3518 return ret;
3519}
3520
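/*
 * Handle a read fault on a file-backed vma: try to map the surrounding
 * pages via ->map_pages() first, then fall back to ->fault() for the
 * faulting page itself.
 */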
3521static vm_fault_t do_read_fault(struct vm_fault *vmf)
3522{
3523 struct vm_area_struct *vma = vmf->vma;
3524 vm_fault_t ret = 0;
3525
	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if the page by this offset is not ready to be mapped (cold
	 * cache or something).
	 */
3531 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3532 ret = do_fault_around(vmf);
3533 if (ret)
3534 return ret;
3535 }
3536
3537 ret = __do_fault(vmf);
3538 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3539 return ret;
3540
3541 ret |= finish_fault(vmf);
3542 unlock_page(vmf->page);
3543 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3544 put_page(vmf->page);
3545 return ret;
3546}
3547
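/*
 * Handle a write fault on a private file-backed vma: read the file page
 * via ->fault(), copy it into a freshly allocated anonymous page and map
 * that copy.
 */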
3548static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3549{
3550 struct vm_area_struct *vma = vmf->vma;
3551 vm_fault_t ret;
3552
3553 if (unlikely(anon_vma_prepare(vma)))
3554 return VM_FAULT_OOM;
3555
3556 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3557 if (!vmf->cow_page)
3558 return VM_FAULT_OOM;
3559
3560 if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3561 &vmf->memcg, false)) {
3562 put_page(vmf->cow_page);
3563 return VM_FAULT_OOM;
3564 }
3565
3566 ret = __do_fault(vmf);
3567 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3568 goto uncharge_out;
3569 if (ret & VM_FAULT_DONE_COW)
3570 return ret;
3571
3572 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3573 __SetPageUptodate(vmf->cow_page);
3574
3575 ret |= finish_fault(vmf);
3576 unlock_page(vmf->page);
3577 put_page(vmf->page);
3578 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3579 goto uncharge_out;
3580 return ret;
3581uncharge_out:
3582 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3583 put_page(vmf->cow_page);
3584 return ret;
3585}
3586
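/*
 * Handle a write fault on a shared file-backed vma: fault the page in,
 * give the filesystem a chance in ->page_mkwrite(), then map the page
 * and handle dirtying.
 */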
3587static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3588{
3589 struct vm_area_struct *vma = vmf->vma;
3590 vm_fault_t ret, tmp;
3591
3592 ret = __do_fault(vmf);
3593 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3594 return ret;
3595
	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable.
	 */
3600 if (vma->vm_ops->page_mkwrite) {
3601 unlock_page(vmf->page);
3602 tmp = do_page_mkwrite(vmf);
3603 if (unlikely(!tmp ||
3604 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3605 put_page(vmf->page);
3606 return tmp;
3607 }
3608 }
3609
3610 ret |= finish_fault(vmf);
3611 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3612 VM_FAULT_RETRY))) {
3613 unlock_page(vmf->page);
3614 put_page(vmf->page);
3615 return ret;
3616 }
3617
3618 fault_dirty_shared_page(vma, vmf->page);
3619 return ret;
3620}
3621
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_sem is released, vma may become invalid (for example
 * by a race with userfaultfd).
 */
3630static vm_fault_t do_fault(struct vm_fault *vmf)
3631{
3632 struct vm_area_struct *vma = vmf->vma;
3633 struct mm_struct *vm_mm = vma->vm_mm;
3634 vm_fault_t ret;
3635
3636
3637
3638
3639 if (!vma->vm_ops->fault) {
		/*
		 * A vma without ->fault should never get here with a
		 * non-present pmd; report SIGBUS if it does.
		 */
3644 if (unlikely(!pmd_present(*vmf->pmd)))
3645 ret = VM_FAULT_SIGBUS;
3646 else {
3647 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
3648 vmf->pmd,
3649 vmf->address,
3650 &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of the
			 * pte by holding ptl and checking again.  A R/M/W
			 * update of the pte involves taking ptl and clearing
			 * the pte (so there is no concurrent modification by
			 * hardware) followed by an update.
			 */
3658 if (unlikely(pte_none(*vmf->pte)))
3659 ret = VM_FAULT_SIGBUS;
3660 else
3661 ret = VM_FAULT_NOPAGE;
3662
3663 pte_unmap_unlock(vmf->pte, vmf->ptl);
3664 }
3665 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
3666 ret = do_read_fault(vmf);
3667 else if (!(vma->vm_flags & VM_SHARED))
3668 ret = do_cow_fault(vmf);
3669 else
3670 ret = do_shared_fault(vmf);
3671
3672
3673 if (vmf->prealloc_pte) {
3674 pte_free(vm_mm, vmf->prealloc_pte);
3675 vmf->prealloc_pte = NULL;
3676 }
3677 return ret;
3678}
3679
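/*
 * Account the NUMA hinting fault and ask the memory policy layer whether
 * the page should be migrated to another node.
 */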
3680static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3681 unsigned long addr, int page_nid,
3682 int *flags)
3683{
3684 get_page(page);
3685
3686 count_vm_numa_event(NUMA_HINT_FAULTS);
3687 if (page_nid == numa_node_id()) {
3688 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3689 *flags |= TNF_FAULT_LOCAL;
3690 }
3691
3692 return mpol_misplaced(page, vma, addr);
3693}
3694
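/*
 * Handle a fault on a PROT_NONE (NUMA hinting) pte: restore the original
 * protections and, if the policy says so, migrate the page towards the
 * faulting node.
 */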
3695static vm_fault_t do_numa_page(struct vm_fault *vmf)
3696{
3697 struct vm_area_struct *vma = vmf->vma;
3698 struct page *page = NULL;
3699 int page_nid = NUMA_NO_NODE;
3700 int last_cpupid;
3701 int target_nid;
3702 bool migrated = false;
3703 pte_t pte, old_pte;
3704 bool was_writable = pte_savedwrite(vmf->orig_pte);
3705 int flags = 0;
3706
	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same().  It's of NUMA type but
	 * the pfn may be garbage if the read was not atomic.
	 */
3712 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3713 spin_lock(vmf->ptl);
3714 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3715 pte_unmap_unlock(vmf->pte, vmf->ptl);
3716 goto out;
3717 }
3718
	/*
	 * Restore the pte to a present, accessible state (and writable
	 * again if it had saved write permission) now that the NUMA
	 * hint has been taken.
	 */
3723 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
3724 pte = pte_modify(old_pte, vma->vm_page_prot);
3725 pte = pte_mkyoung(pte);
3726 if (was_writable)
3727 pte = pte_mkwrite(pte);
3728 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
3729 update_mmu_cache(vma, vmf->address, vmf->pte);
3730
3731 page = vm_normal_page(vma, vmf->address, pte);
3732 if (!page) {
3733 pte_unmap_unlock(vmf->pte, vmf->ptl);
3734 return 0;
3735 }
3736
3737
3738 if (PageCompound(page)) {
3739 pte_unmap_unlock(vmf->pte, vmf->ptl);
3740 return 0;
3741 }
3742
	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt
	 * as much anyway since they can be in shared cache state.  This
	 * misses the case where a mapping is writable but the process never
	 * writes to it but pte_write gets cleared during protection updates,
	 * and pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
3751 if (!pte_write(pte))
3752 flags |= TNF_NO_GROUP;
3753
	/*
	 * Flag if the page is shared between multiple address spaces.  This
	 * is later used when determining whether to group tasks together.
	 */
3758 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3759 flags |= TNF_SHARED;
3760
3761 last_cpupid = page_cpupid_last(page);
3762 page_nid = page_to_nid(page);
3763 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3764 &flags);
3765 pte_unmap_unlock(vmf->pte, vmf->ptl);
3766 if (target_nid == NUMA_NO_NODE) {
3767 put_page(page);
3768 goto out;
3769 }
3770
3771
3772 migrated = migrate_misplaced_page(page, vma, target_nid);
3773 if (migrated) {
3774 page_nid = target_nid;
3775 flags |= TNF_MIGRATED;
3776 } else
3777 flags |= TNF_MIGRATE_FAIL;
3778
3779out:
3780 if (page_nid != NUMA_NO_NODE)
3781 task_numa_fault(last_cpupid, page_nid, 1, flags);
3782 return 0;
3783}
3784
3785static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
3786{
3787 if (vma_is_anonymous(vmf->vma))
3788 return do_huge_pmd_anonymous_page(vmf);
3789 if (vmf->vma->vm_ops->huge_fault)
3790 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3791 return VM_FAULT_FALLBACK;
3792}
3793
3794
3795static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3796{
3797 if (vma_is_anonymous(vmf->vma))
3798 return do_huge_pmd_wp_page(vmf, orig_pmd);
3799 if (vmf->vma->vm_ops->huge_fault)
3800 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3801
3802
3803 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3804 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3805
3806 return VM_FAULT_FALLBACK;
3807}
3808
3809static inline bool vma_is_accessible(struct vm_area_struct *vma)
3810{
3811 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3812}
3813
3814static vm_fault_t create_huge_pud(struct vm_fault *vmf)
3815{
3816#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3817
3818 if (vma_is_anonymous(vmf->vma))
3819 return VM_FAULT_FALLBACK;
3820 if (vmf->vma->vm_ops->huge_fault)
3821 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3822#endif
3823 return VM_FAULT_FALLBACK;
3824}
3825
3826static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3827{
3828#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3829
3830 if (vma_is_anonymous(vmf->vma))
3831 return VM_FAULT_FALLBACK;
3832 if (vmf->vma->vm_ops->huge_fault)
3833 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3834#endif
3835 return VM_FAULT_FALLBACK;
3836}
3837
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
3853static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
3854{
3855 pte_t entry;
3856
3857 if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate a huge page, and if we expose the page
		 * table for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
3864 vmf->pte = NULL;
3865 } else {
3866
3867 if (pmd_devmap_trans_unstable(vmf->pmd))
3868 return 0;
3869
		/*
		 * A regular pmd is established and it can't morph into a
		 * huge pmd from under us anymore at this point because we
		 * hold the mmap_sem read mode and khugepaged takes it in
		 * write mode.  So now it's safe to run pte_offset_map().
		 */
3875 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3876 vmf->orig_pte = *vmf->pte;
3877
		/*
		 * Some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
3886 barrier();
3887 if (pte_none(vmf->orig_pte)) {
3888 pte_unmap(vmf->pte);
3889 vmf->pte = NULL;
3890 }
3891 }
3892
3893 if (!vmf->pte) {
3894 if (vma_is_anonymous(vmf->vma))
3895 return do_anonymous_page(vmf);
3896 else
3897 return do_fault(vmf);
3898 }
3899
3900 if (!pte_present(vmf->orig_pte))
3901 return do_swap_page(vmf);
3902
3903 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3904 return do_numa_page(vmf);
3905
3906 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3907 spin_lock(vmf->ptl);
3908 entry = vmf->orig_pte;
3909 if (unlikely(!pte_same(*vmf->pte, entry)))
3910 goto unlock;
3911 if (vmf->flags & FAULT_FLAG_WRITE) {
3912 if (!pte_write(entry))
3913 return do_wp_page(vmf);
3914 entry = pte_mkdirty(entry);
3915 }
3916 entry = pte_mkyoung(entry);
3917 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3918 vmf->flags & FAULT_FLAG_WRITE)) {
3919 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3920 } else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
3927 if (vmf->flags & FAULT_FLAG_WRITE)
3928 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3929 }
3930unlock:
3931 pte_unmap_unlock(vmf->pte, vmf->ptl);
3932 return 0;
3933}
3934
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3941static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3942 unsigned long address, unsigned int flags)
3943{
3944 struct vm_fault vmf = {
3945 .vma = vma,
3946 .address = address & PAGE_MASK,
3947 .flags = flags,
3948 .pgoff = linear_page_index(vma, address),
3949 .gfp_mask = __get_fault_gfp_mask(vma),
3950 };
3951 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3952 struct mm_struct *mm = vma->vm_mm;
3953 pgd_t *pgd;
3954 p4d_t *p4d;
3955 vm_fault_t ret;
3956
3957 pgd = pgd_offset(mm, address);
3958 p4d = p4d_alloc(mm, pgd, address);
3959 if (!p4d)
3960 return VM_FAULT_OOM;
3961
3962 vmf.pud = pud_alloc(mm, p4d, address);
3963 if (!vmf.pud)
3964 return VM_FAULT_OOM;
3965 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3966 ret = create_huge_pud(&vmf);
3967 if (!(ret & VM_FAULT_FALLBACK))
3968 return ret;
3969 } else {
3970 pud_t orig_pud = *vmf.pud;
3971
3972 barrier();
3973 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3974
3975
3976
3977 if (dirty && !pud_write(orig_pud)) {
3978 ret = wp_huge_pud(&vmf, orig_pud);
3979 if (!(ret & VM_FAULT_FALLBACK))
3980 return ret;
3981 } else {
3982 huge_pud_set_accessed(&vmf, orig_pud);
3983 return 0;
3984 }
3985 }
3986 }
3987
3988 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3989 if (!vmf.pmd)
3990 return VM_FAULT_OOM;
3991 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3992 ret = create_huge_pmd(&vmf);
3993 if (!(ret & VM_FAULT_FALLBACK))
3994 return ret;
3995 } else {
3996 pmd_t orig_pmd = *vmf.pmd;
3997
3998 barrier();
3999 if (unlikely(is_swap_pmd(orig_pmd))) {
4000 VM_BUG_ON(thp_migration_supported() &&
4001 !is_pmd_migration_entry(orig_pmd));
4002 if (is_pmd_migration_entry(orig_pmd))
4003 pmd_migration_entry_wait(mm, vmf.pmd);
4004 return 0;
4005 }
4006 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4007 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4008 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4009
4010 if (dirty && !pmd_write(orig_pmd)) {
4011 ret = wp_huge_pmd(&vmf, orig_pmd);
4012 if (!(ret & VM_FAULT_FALLBACK))
4013 return ret;
4014 } else {
4015 huge_pmd_set_accessed(&vmf, orig_pmd);
4016 return 0;
4017 }
4018 }
4019 }
4020
4021 return handle_pte_fault(&vmf);
4022}
4023
/*
 * Entry point for page-fault handling: the caller must already hold the
 * mm semaphore.  The mmap_sem may have been released on return depending
 * on flags and the return value; see filemap_fault() and
 * __lock_page_or_retry().
 */
4030vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4031 unsigned int flags)
4032{
4033 vm_fault_t ret;
4034
4035 __set_current_state(TASK_RUNNING);
4036
4037 count_vm_event(PGFAULT);
4038 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4039
4040
4041 check_sync_rss_stat(current);
4042
4043 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4044 flags & FAULT_FLAG_INSTRUCTION,
4045 flags & FAULT_FLAG_REMOTE))
4046 return VM_FAULT_SIGSEGV;
4047
	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * mode.  Kernel faults are handled more gracefully.
	 */
4052 if (flags & FAULT_FLAG_USER)
4053 mem_cgroup_enter_user_fault();
4054
4055 if (unlikely(is_vm_hugetlb_page(vma)))
4056 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4057 else
4058 ret = __handle_mm_fault(vma, address, flags);
4059
4060 if (flags & FAULT_FLAG_USER) {
4061 mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
4068 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4069 mem_cgroup_oom_synchronize(false);
4070 }
4071
4072 return ret;
4073}
4074EXPORT_SYMBOL_GPL(handle_mm_fault);
4075
4076#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
4081int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4082{
4083 p4d_t *new = p4d_alloc_one(mm, address);
4084 if (!new)
4085 return -ENOMEM;
4086
4087 smp_wmb();
4088
4089 spin_lock(&mm->page_table_lock);
4090 if (pgd_present(*pgd))
4091 p4d_free(mm, new);
4092 else
4093 pgd_populate(mm, pgd, new);
4094 spin_unlock(&mm->page_table_lock);
4095 return 0;
4096}
4097#endif
4098
4099#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
4104int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4105{
4106 pud_t *new = pud_alloc_one(mm, address);
4107 if (!new)
4108 return -ENOMEM;
4109
4110 smp_wmb();
4111
4112 spin_lock(&mm->page_table_lock);
4113#ifndef __ARCH_HAS_5LEVEL_HACK
4114 if (!p4d_present(*p4d)) {
4115 mm_inc_nr_puds(mm);
4116 p4d_populate(mm, p4d, new);
4117 } else
4118 pud_free(mm, new);
4119#else
4120 if (!pgd_present(*p4d)) {
4121 mm_inc_nr_puds(mm);
4122 pgd_populate(mm, p4d, new);
4123 } else
4124 pud_free(mm, new);
4125#endif
4126 spin_unlock(&mm->page_table_lock);
4127 return 0;
4128}
4129#endif
4130
4131#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
4136int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4137{
4138 spinlock_t *ptl;
4139 pmd_t *new = pmd_alloc_one(mm, address);
4140 if (!new)
4141 return -ENOMEM;
4142
4143 smp_wmb();
4144
4145 ptl = pud_lock(mm, pud);
4146#ifndef __ARCH_HAS_4LEVEL_HACK
4147 if (!pud_present(*pud)) {
4148 mm_inc_nr_pmds(mm);
4149 pud_populate(mm, pud, new);
4150 } else
4151 pmd_free(mm, new);
4152#else
4153 if (!pgd_present(*pud)) {
4154 mm_inc_nr_pmds(mm);
4155 pgd_populate(mm, pud, new);
4156 } else
4157 pmd_free(mm, new);
4158#endif
4159 spin_unlock(ptl);
4160 return 0;
4161}
4162#endif
4163
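/*
 * Walk the page tables for @address and return the mapped pte (or huge
 * pmd, if @pmdpp is supplied) with its lock held, optionally starting an
 * mmu notifier invalidation range that the caller must end.
 */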
4164static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4165 struct mmu_notifier_range *range,
4166 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4167{
4168 pgd_t *pgd;
4169 p4d_t *p4d;
4170 pud_t *pud;
4171 pmd_t *pmd;
4172 pte_t *ptep;
4173
4174 pgd = pgd_offset(mm, address);
4175 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4176 goto out;
4177
4178 p4d = p4d_offset(pgd, address);
4179 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4180 goto out;
4181
4182 pud = pud_offset(p4d, address);
4183 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4184 goto out;
4185
4186 pmd = pmd_offset(pud, address);
4187 VM_BUG_ON(pmd_trans_huge(*pmd));
4188
4189 if (pmd_huge(*pmd)) {
4190 if (!pmdpp)
4191 goto out;
4192
4193 if (range) {
4194 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4195 NULL, mm, address & PMD_MASK,
4196 (address & PMD_MASK) + PMD_SIZE);
4197 mmu_notifier_invalidate_range_start(range);
4198 }
4199 *ptlp = pmd_lock(mm, pmd);
4200 if (pmd_huge(*pmd)) {
4201 *pmdpp = pmd;
4202 return 0;
4203 }
4204 spin_unlock(*ptlp);
4205 if (range)
4206 mmu_notifier_invalidate_range_end(range);
4207 }
4208
4209 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4210 goto out;
4211
4212 if (range) {
4213 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4214 address & PAGE_MASK,
4215 (address & PAGE_MASK) + PAGE_SIZE);
4216 mmu_notifier_invalidate_range_start(range);
4217 }
4218 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4219 if (!pte_present(*ptep))
4220 goto unlock;
4221 *ptepp = ptep;
4222 return 0;
4223unlock:
4224 pte_unmap_unlock(ptep, *ptlp);
4225 if (range)
4226 mmu_notifier_invalidate_range_end(range);
4227out:
4228 return -EINVAL;
4229}
4230
4231static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4232 pte_t **ptepp, spinlock_t **ptlp)
4233{
4234 int res;
4235
4236
4237 (void) __cond_lock(*ptlp,
4238 !(res = __follow_pte_pmd(mm, address, NULL,
4239 ptepp, NULL, ptlp)));
4240 return res;
4241}
4242
4243int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4244 struct mmu_notifier_range *range,
4245 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4246{
4247 int res;
4248
4249
4250 (void) __cond_lock(*ptlp,
4251 !(res = __follow_pte_pmd(mm, address, range,
4252 ptepp, pmdpp, ptlp)));
4253 return res;
4254}
4255EXPORT_SYMBOL(follow_pte_pmd);
4256
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
4267int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4268 unsigned long *pfn)
4269{
4270 int ret = -EINVAL;
4271 spinlock_t *ptl;
4272 pte_t *ptep;
4273
4274 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4275 return ret;
4276
4277 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4278 if (ret)
4279 return ret;
4280 *pfn = pte_pfn(*ptep);
4281 pte_unmap_unlock(ptep, ptl);
4282 return 0;
4283}
4284EXPORT_SYMBOL(follow_pfn);
4285
4286#ifdef CONFIG_HAVE_IOREMAP_PROT
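/*
 * Resolve the physical address and protection bits behind a user virtual
 * address in a VM_IO/VM_PFNMAP mapping.
 */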
4287int follow_phys(struct vm_area_struct *vma,
4288 unsigned long address, unsigned int flags,
4289 unsigned long *prot, resource_size_t *phys)
4290{
4291 int ret = -EINVAL;
4292 pte_t *ptep, pte;
4293 spinlock_t *ptl;
4294
4295 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4296 goto out;
4297
4298 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4299 goto out;
4300 pte = *ptep;
4301
4302 if ((flags & FOLL_WRITE) && !pte_write(pte))
4303 goto unlock;
4304
4305 *prot = pgprot_val(pte_pgprot(pte));
4306 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4307
4308 ret = 0;
4309unlock:
4310 pte_unmap_unlock(ptep, ptl);
4311out:
4312 return ret;
4313}
4314
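/*
 * Access an IO/PFN mapping for ptrace-style access by temporarily
 * ioremapping the underlying physical range.
 */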
4315int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4316 void *buf, int len, int write)
4317{
4318 resource_size_t phys_addr;
4319 unsigned long prot = 0;
4320 void __iomem *maddr;
4321 int offset = addr & (PAGE_SIZE-1);
4322
4323 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4324 return -EINVAL;
4325
4326 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4327 if (!maddr)
4328 return -ENOMEM;
4329
4330 if (write)
4331 memcpy_toio(maddr + offset, buf, len);
4332 else
4333 memcpy_fromio(buf, maddr + offset, len);
4334 iounmap(maddr);
4335
4336 return len;
4337}
4338EXPORT_SYMBOL_GPL(generic_access_phys);
4339#endif
4340
/*
 * Access another process' address space as given in mm.  If non-NULL, use
 * the given task for page fault accounting.
 */
4345int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4346 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4347{
4348 struct vm_area_struct *vma;
4349 void *old_buf = buf;
4350 int write = gup_flags & FOLL_WRITE;
4351
4352 down_read(&mm->mmap_sem);
4353
4354 while (len) {
4355 int bytes, ret, offset;
4356 void *maddr;
4357 struct page *page = NULL;
4358
4359 ret = get_user_pages_remote(tsk, mm, addr, 1,
4360 gup_flags, &page, &vma, NULL);
4361 if (ret <= 0) {
4362#ifndef CONFIG_HAVE_IOREMAP_PROT
4363 break;
4364#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
4369 vma = find_vma(mm, addr);
4370 if (!vma || vma->vm_start > addr)
4371 break;
4372 if (vma->vm_ops && vma->vm_ops->access)
4373 ret = vma->vm_ops->access(vma, addr, buf,
4374 len, write);
4375 if (ret <= 0)
4376 break;
4377 bytes = ret;
4378#endif
4379 } else {
4380 bytes = len;
4381 offset = addr & (PAGE_SIZE-1);
4382 if (bytes > PAGE_SIZE-offset)
4383 bytes = PAGE_SIZE-offset;
4384
4385 maddr = kmap(page);
4386 if (write) {
4387 copy_to_user_page(vma, page, addr,
4388 maddr + offset, buf, bytes);
4389 set_page_dirty_lock(page);
4390 } else {
4391 copy_from_user_page(vma, page, addr,
4392 buf, maddr + offset, bytes);
4393 }
4394 kunmap(page);
4395 put_page(page);
4396 }
4397 len -= bytes;
4398 buf += bytes;
4399 addr += bytes;
4400 }
4401 up_read(&mm->mmap_sem);
4402
4403 return buf - old_buf;
4404}
4405
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
4418int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4419 void *buf, int len, unsigned int gup_flags)
4420{
4421 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4422}
4423
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Do not walk the page table directly, use get_user_pages().
 */
4429int access_process_vm(struct task_struct *tsk, unsigned long addr,
4430 void *buf, int len, unsigned int gup_flags)
4431{
4432 struct mm_struct *mm;
4433 int ret;
4434
4435 mm = get_task_mm(tsk);
4436 if (!mm)
4437 return 0;
4438
4439 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4440
4441 mmput(mm);
4442
4443 return ret;
4444}
4445EXPORT_SYMBOL_GPL(access_process_vm);
4446
/*
 * Print the name of a VMA.
 */
4450void print_vma_addr(char *prefix, unsigned long ip)
4451{
4452 struct mm_struct *mm = current->mm;
4453 struct vm_area_struct *vma;
4454
	/*
	 * we might be running from an atomic context so we cannot sleep
	 */
4458 if (!down_read_trylock(&mm->mmap_sem))
4459 return;
4460
4461 vma = find_vma(mm, ip);
4462 if (vma && vma->vm_file) {
4463 struct file *f = vma->vm_file;
4464 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4465 if (buf) {
4466 char *p;
4467
4468 p = file_path(f, buf, PAGE_SIZE);
4469 if (IS_ERR(p))
4470 p = "?";
4471 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4472 vma->vm_start,
4473 vma->vm_end - vma->vm_start);
4474 free_page((unsigned long)buf);
4475 }
4476 }
4477 up_read(&mm->mmap_sem);
4478}
4479
4480#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4481void __might_fault(const char *file, int line)
4482{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */
4489 if (uaccess_kernel())
4490 return;
4491 if (pagefault_disabled())
4492 return;
4493 __might_sleep(file, line, 0);
4494#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4495 if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
4497#endif
4498}
4499EXPORT_SYMBOL(__might_fault);
4500#endif
4501
4502#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
4508static inline void process_huge_page(
4509 unsigned long addr_hint, unsigned int pages_per_huge_page,
4510 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4511 void *arg)
4512{
4513 int i, n, base, l;
4514 unsigned long addr = addr_hint &
4515 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4516
4517
4518 might_sleep();
4519 n = (addr_hint - addr) / PAGE_SIZE;
4520 if (2 * n <= pages_per_huge_page) {
4521
4522 base = 0;
4523 l = n;
4524
4525 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4526 cond_resched();
4527 process_subpage(addr + i * PAGE_SIZE, i, arg);
4528 }
4529 } else {
4530
4531 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4532 l = pages_per_huge_page - n;
4533
4534 for (i = 0; i < base; i++) {
4535 cond_resched();
4536 process_subpage(addr + i * PAGE_SIZE, i, arg);
4537 }
4538 }
4539
	/*
	 * Process the remaining subpages in a left-right-left-right pattern
	 * towards the target subpage.
	 */
4543 for (i = 0; i < l; i++) {
4544 int left_idx = base + i;
4545 int right_idx = base + 2 * l - 1 - i;
4546
4547 cond_resched();
4548 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4549 cond_resched();
4550 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4551 }
4552}
4553
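/*
 * Clear a gigantic page subpage by subpage; mem_map may not be virtually
 * contiguous for such pages, hence mem_map_next().
 */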
4554static void clear_gigantic_page(struct page *page,
4555 unsigned long addr,
4556 unsigned int pages_per_huge_page)
4557{
4558 int i;
4559 struct page *p = page;
4560
4561 might_sleep();
4562 for (i = 0; i < pages_per_huge_page;
4563 i++, p = mem_map_next(p, page, i)) {
4564 cond_resched();
4565 clear_user_highpage(p, addr + i * PAGE_SIZE);
4566 }
4567}
4568
4569static void clear_subpage(unsigned long addr, int idx, void *arg)
4570{
4571 struct page *page = arg;
4572
4573 clear_user_highpage(page + idx, addr);
4574}
4575
4576void clear_huge_page(struct page *page,
4577 unsigned long addr_hint, unsigned int pages_per_huge_page)
4578{
4579 unsigned long addr = addr_hint &
4580 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4581
4582 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4583 clear_gigantic_page(page, addr, pages_per_huge_page);
4584 return;
4585 }
4586
4587 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4588}
4589
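/*
 * Copy a gigantic page subpage by subpage, again using mem_map_next() to
 * cope with a possibly non-contiguous mem_map.
 */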
4590static void copy_user_gigantic_page(struct page *dst, struct page *src,
4591 unsigned long addr,
4592 struct vm_area_struct *vma,
4593 unsigned int pages_per_huge_page)
4594{
4595 int i;
4596 struct page *dst_base = dst;
4597 struct page *src_base = src;
4598
4599 for (i = 0; i < pages_per_huge_page; ) {
4600 cond_resched();
4601 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4602
4603 i++;
4604 dst = mem_map_next(dst, dst_base, i);
4605 src = mem_map_next(src, src_base, i);
4606 }
4607}
4608
4609struct copy_subpage_arg {
4610 struct page *dst;
4611 struct page *src;
4612 struct vm_area_struct *vma;
4613};
4614
4615static void copy_subpage(unsigned long addr, int idx, void *arg)
4616{
4617 struct copy_subpage_arg *copy_arg = arg;
4618
4619 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4620 addr, copy_arg->vma);
4621}
4622
4623void copy_user_huge_page(struct page *dst, struct page *src,
4624 unsigned long addr_hint, struct vm_area_struct *vma,
4625 unsigned int pages_per_huge_page)
4626{
4627 unsigned long addr = addr_hint &
4628 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4629 struct copy_subpage_arg arg = {
4630 .dst = dst,
4631 .src = src,
4632 .vma = vma,
4633 };
4634
4635 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4636 copy_user_gigantic_page(dst, src, addr, vma,
4637 pages_per_huge_page);
4638 return;
4639 }
4640
4641 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4642}
4643
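/*
 * Copy user data into a huge page one subpage at a time.  Returns the
 * number of bytes that could NOT be copied (0 on complete success), like
 * copy_from_user().
 */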
4644long copy_huge_page_from_user(struct page *dst_page,
4645 const void __user *usr_src,
4646 unsigned int pages_per_huge_page,
4647 bool allow_pagefault)
4648{
4649 void *src = (void *)usr_src;
4650 void *page_kaddr;
4651 unsigned long i, rc = 0;
4652 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4653
4654 for (i = 0; i < pages_per_huge_page; i++) {
4655 if (allow_pagefault)
4656 page_kaddr = kmap(dst_page + i);
4657 else
4658 page_kaddr = kmap_atomic(dst_page + i);
4659 rc = copy_from_user(page_kaddr,
4660 (const void __user *)(src + i * PAGE_SIZE),
4661 PAGE_SIZE);
4662 if (allow_pagefault)
4663 kunmap(dst_page + i);
4664 else
4665 kunmap_atomic(page_kaddr);
4666
4667 ret_val -= (PAGE_SIZE - rc);
4668 if (rc)
4669 break;
4670
4671 cond_resched();
4672 }
4673 return ret_val;
4674}
4675#endif
4676
4677#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4678
4679static struct kmem_cache *page_ptl_cachep;
4680
4681void __init ptlock_cache_init(void)
4682{
4683 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4684 SLAB_PANIC, NULL);
4685}
4686
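/*
 * With split PTE locks that don't fit in struct page, each page table
 * page gets its spinlock from this dedicated slab cache.
 */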
4687bool ptlock_alloc(struct page *page)
4688{
4689 spinlock_t *ptl;
4690
4691 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4692 if (!ptl)
4693 return false;
4694 page->ptl = ptl;
4695 return true;
4696}
4697
4698void ptlock_free(struct page *page)
4699{
4700 kmem_cache_free(page_ptl_cachep, page->ptl);
4701}
4702#endif
4703