42#include <linux/kernel_stat.h>
43#include <linux/mm.h>
44#include <linux/sched/mm.h>
45#include <linux/sched/coredump.h>
46#include <linux/sched/numa_balancing.h>
47#include <linux/sched/task.h>
48#include <linux/hugetlb.h>
49#include <linux/mman.h>
50#include <linux/swap.h>
51#include <linux/highmem.h>
52#include <linux/pagemap.h>
53#include <linux/memremap.h>
54#include <linux/ksm.h>
55#include <linux/rmap.h>
56#include <linux/export.h>
57#include <linux/delayacct.h>
58#include <linux/init.h>
59#include <linux/pfn_t.h>
60#include <linux/writeback.h>
61#include <linux/memcontrol.h>
62#include <linux/mmu_notifier.h>
63#include <linux/swapops.h>
64#include <linux/elf.h>
65#include <linux/gfp.h>
66#include <linux/migrate.h>
67#include <linux/string.h>
68#include <linux/dma-debug.h>
69#include <linux/debugfs.h>
70#include <linux/userfaultfd_k.h>
71#include <linux/dax.h>
72#include <linux/oom.h>
73#include <linux/numa.h>
74
75#include <asm/io.h>
76#include <asm/mmu_context.h>
77#include <asm/pgalloc.h>
78#include <linux/uaccess.h>
79#include <asm/tlb.h>
80#include <asm/tlbflush.h>
81#include <asm/pgtable.h>
82
83#include "internal.h"
84
85#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
86#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
87#endif
88
89#ifndef CONFIG_NEED_MULTIPLE_NODES
90
91unsigned long max_mapnr;
92EXPORT_SYMBOL(max_mapnr);
93
94struct page *mem_map;
95EXPORT_SYMBOL(mem_map);
96#endif

/*
 * high_memory marks the upper bound of the kernel's directly mapped
 * memory; ioremap() and friends rely on this boundary.
 */
105void *high_memory;
106EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
114int randomize_va_space __read_mostly =
115#ifdef CONFIG_COMPAT_BRK
116 1;
117#else
118 2;
119#endif
120
121static int __init disable_randmaps(char *s)
122{
123 randomize_va_space = 0;
124 return 1;
125}
126__setup("norandmaps", disable_randmaps);
127
128unsigned long zero_pfn __read_mostly;
129EXPORT_SYMBOL(zero_pfn);
130
131unsigned long highest_memmap_pfn __read_mostly;

/*
 * The zero page itself is set up by the architecture; here we only
 * record its page frame number for later is_zero_pfn() checks.
 */
136static int __init init_zero_pfn(void)
137{
138 zero_pfn = page_to_pfn(ZERO_PAGE(0));
139 return 0;
140}
141core_initcall(init_zero_pfn);
142
143
144#if defined(SPLIT_RSS_COUNTING)
145
146void sync_mm_rss(struct mm_struct *mm)
147{
148 int i;
149
150 for (i = 0; i < NR_MM_COUNTERS; i++) {
151 if (current->rss_stat.count[i]) {
152 add_mm_counter(mm, i, current->rss_stat.count[i]);
153 current->rss_stat.count[i] = 0;
154 }
155 }
156 current->rss_stat.events = 0;
157}
158
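/*
 * Fast path for the RSS counters: charge the current task's cached
 * rss_stat when the mm belongs to current, and fall back to the atomic
 * per-mm counters otherwise.
 */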
159static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
160{
161 struct task_struct *task = current;
162
163 if (likely(task->mm == mm))
164 task->rss_stat.count[member] += val;
165 else
166 add_mm_counter(mm, member, val);
167}
168#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
169#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
172#define TASK_RSS_EVENTS_THRESH (64)
173static void check_sync_rss_stat(struct task_struct *task)
174{
175 if (unlikely(task != current))
176 return;
177 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
178 sync_mm_rss(task->mm);
179}
180#else
181
182#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
183#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
184
185static void check_sync_rss_stat(struct task_struct *task)
186{
187}
188
189#endif

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
195static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
196 unsigned long addr)
197{
198 pgtable_t token = pmd_pgtable(*pmd);
199 pmd_clear(pmd);
200 pte_free_tlb(tlb, token, addr);
201 mm_dec_nr_ptes(tlb->mm);
202}
203
204static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
205 unsigned long addr, unsigned long end,
206 unsigned long floor, unsigned long ceiling)
207{
208 pmd_t *pmd;
209 unsigned long next;
210 unsigned long start;
211
212 start = addr;
213 pmd = pmd_offset(pud, addr);
214 do {
215 next = pmd_addr_end(addr, end);
216 if (pmd_none_or_clear_bad(pmd))
217 continue;
218 free_pte_range(tlb, pmd, addr);
219 } while (pmd++, addr = next, addr != end);
220
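	/*
	 * Free the pmd page itself only when the PUD-aligned range it maps
	 * lies entirely within the [floor, ceiling) window we were given;
	 * otherwise a neighbouring region may still be using it.
	 */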
221 start &= PUD_MASK;
222 if (start < floor)
223 return;
224 if (ceiling) {
225 ceiling &= PUD_MASK;
226 if (!ceiling)
227 return;
228 }
229 if (end - 1 > ceiling - 1)
230 return;
231
232 pmd = pmd_offset(pud, start);
233 pud_clear(pud);
234 pmd_free_tlb(tlb, pmd, start);
235 mm_dec_nr_pmds(tlb->mm);
236}
237
238static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
239 unsigned long addr, unsigned long end,
240 unsigned long floor, unsigned long ceiling)
241{
242 pud_t *pud;
243 unsigned long next;
244 unsigned long start;
245
246 start = addr;
247 pud = pud_offset(p4d, addr);
248 do {
249 next = pud_addr_end(addr, end);
250 if (pud_none_or_clear_bad(pud))
251 continue;
252 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
253 } while (pud++, addr = next, addr != end);
254
255 start &= P4D_MASK;
256 if (start < floor)
257 return;
258 if (ceiling) {
259 ceiling &= P4D_MASK;
260 if (!ceiling)
261 return;
262 }
263 if (end - 1 > ceiling - 1)
264 return;
265
266 pud = pud_offset(p4d, start);
267 p4d_clear(p4d);
268 pud_free_tlb(tlb, pud, start);
269 mm_dec_nr_puds(tlb->mm);
270}
271
272static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
273 unsigned long addr, unsigned long end,
274 unsigned long floor, unsigned long ceiling)
275{
276 p4d_t *p4d;
277 unsigned long next;
278 unsigned long start;
279
280 start = addr;
281 p4d = p4d_offset(pgd, addr);
282 do {
283 next = p4d_addr_end(addr, end);
284 if (p4d_none_or_clear_bad(p4d))
285 continue;
286 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
287 } while (p4d++, addr = next, addr != end);
288
289 start &= PGDIR_MASK;
290 if (start < floor)
291 return;
292 if (ceiling) {
293 ceiling &= PGDIR_MASK;
294 if (!ceiling)
295 return;
296 }
297 if (end - 1 > ceiling - 1)
298 return;
299
300 p4d = p4d_offset(pgd, start);
301 pgd_clear(pgd);
302 p4d_free_tlb(tlb, p4d, start);
303}

/*
 * This function frees user-level page tables of a process.
 */
308void free_pgd_range(struct mmu_gather *tlb,
309 unsigned long addr, unsigned long end,
310 unsigned long floor, unsigned long ceiling)
311{
312 pgd_t *pgd;
313 unsigned long next;
	/*
	 * The floor and ceiling arguments bound how far page tables may be
	 * freed: typically to the end of the previous vma and the start of
	 * the next one (or the limits of the address space).
	 *
	 * The checks below trim [addr, end) so that no page table shared
	 * with a neighbouring region is freed: addr is rounded down to a
	 * PMD boundary and pushed back up by PMD_SIZE if that would fall
	 * below floor, ceiling is rounded down to a PMD boundary, and end
	 * is pulled back by PMD_SIZE if it reaches past ceiling.  The
	 * comparisons are done on "value - 1" so that a ceiling of zero
	 * means "no upper limit".  If nothing remains afterwards there is
	 * no whole page table to free and we return early.
	 */
341 addr &= PMD_MASK;
342 if (addr < floor) {
343 addr += PMD_SIZE;
344 if (!addr)
345 return;
346 }
347 if (ceiling) {
348 ceiling &= PMD_MASK;
349 if (!ceiling)
350 return;
351 }
352 if (end - 1 > ceiling - 1)
353 end -= PMD_SIZE;
354 if (addr > end - 1)
355 return;

	/*
	 * Page-table pages were queued on the mmu_gather with PAGE_SIZE
	 * granularity (see pte_free_tlb()); make sure the gather is using
	 * that page size before they are freed.
	 */
360 tlb_change_page_size(tlb, PAGE_SIZE);
361 pgd = pgd_offset(tlb->mm, addr);
362 do {
363 next = pgd_addr_end(addr, end);
364 if (pgd_none_or_clear_bad(pgd))
365 continue;
366 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
367 } while (pgd++, addr = next, addr != end);
368}
369
370void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
371 unsigned long floor, unsigned long ceiling)
372{
373 while (vma) {
374 struct vm_area_struct *next = vma->vm_next;
375 unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
381 unlink_anon_vmas(vma);
382 unlink_file_vma(vma);
383
384 if (is_vm_hugetlb_page(vma)) {
385 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
386 floor, next ? next->vm_start : ceiling);
387 } else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
391 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
392 && !is_vm_hugetlb_page(next)) {
393 vma = next;
394 next = vma->vm_next;
395 unlink_anon_vmas(vma);
396 unlink_file_vma(vma);
397 }
398 free_pgd_range(tlb, addr, vma->vm_end,
399 floor, next ? next->vm_start : ceiling);
400 }
401 vma = next;
402 }
403}
404
405int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
406{
407 spinlock_t *ptl;
408 pgtable_t new = pte_alloc_one(mm);
409 if (!new)
410 return -ENOMEM;

	/*
	 * Ensure all pte setup is visible before the pte is made visible to
	 * other CPUs by being put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
425 smp_wmb();
426
427 ptl = pmd_lock(mm, pmd);
428 if (likely(pmd_none(*pmd))) {
429 mm_inc_nr_ptes(mm);
430 pmd_populate(mm, pmd, new);
431 new = NULL;
432 }
433 spin_unlock(ptl);
434 if (new)
435 pte_free(mm, new);
436 return 0;
437}
438
439int __pte_alloc_kernel(pmd_t *pmd)
440{
441 pte_t *new = pte_alloc_one_kernel(&init_mm);
442 if (!new)
443 return -ENOMEM;
444
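	/* As in __pte_alloc(): order the table's initialisation before it
	 * is published in the pmd. */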
445 smp_wmb();
446
447 spin_lock(&init_mm.page_table_lock);
448 if (likely(pmd_none(*pmd))) {
449 pmd_populate_kernel(&init_mm, pmd, new);
450 new = NULL;
451 }
452 spin_unlock(&init_mm.page_table_lock);
453 if (new)
454 pte_free_kernel(&init_mm, new);
455 return 0;
456}
457
458static inline void init_rss_vec(int *rss)
459{
460 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
461}
462
463static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
464{
465 int i;
466
467 if (current->mm == mm)
468 sync_mm_rss(mm);
469 for (i = 0; i < NR_MM_COUNTERS; i++)
470 if (rss[i])
471 add_mm_counter(mm, i, rss[i]);
472}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
481static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
482 pte_t pte, struct page *page)
483{
484 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
485 p4d_t *p4d = p4d_offset(pgd, addr);
486 pud_t *pud = pud_offset(p4d, addr);
487 pmd_t *pmd = pmd_offset(pud, addr);
488 struct address_space *mapping;
489 pgoff_t index;
490 static unsigned long resume;
491 static unsigned long nr_shown;
492 static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
498 if (nr_shown == 60) {
499 if (time_before(jiffies, resume)) {
500 nr_unshown++;
501 return;
502 }
503 if (nr_unshown) {
504 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
505 nr_unshown);
506 nr_unshown = 0;
507 }
508 nr_shown = 0;
509 }
510 if (nr_shown++ == 0)
511 resume = jiffies + 60 * HZ;
512
513 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
514 index = linear_page_index(vma, addr);
515
516 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
517 current->comm,
518 (long long)pte_val(pte), (long long)pmd_val(*pmd));
519 if (page)
520 dump_page(page, "bad pte");
521 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
523 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
524 vma->vm_file,
525 vma->vm_ops ? vma->vm_ops->fault : NULL,
526 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
527 mapping ? mapping->a_ops->readpage : NULL);
528 dump_stack();
529 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
530}

/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a struct
 * page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial. Secondly,
 * an architecture may not have a spare pte bit, which requires a more
 * complicated scheme, described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered
 * a special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by remap_pfn_range(): the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every
 * special mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * and for normal mappings this is false.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing; the difference is that all pages with a valid pfn are
 * refcounted and considered normal pages by the VM, at the cost of
 * refcounting overhead but without the strict linearity rule above.
 */
574struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
575 pte_t pte)
576{
577 unsigned long pfn = pte_pfn(pte);
578
579 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
580 if (likely(!pte_special(pte)))
581 goto check_pfn;
582 if (vma->vm_ops && vma->vm_ops->find_special_page)
583 return vma->vm_ops->find_special_page(vma, addr);
584 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
585 return NULL;
586 if (is_zero_pfn(pfn))
587 return NULL;
588 if (pte_devmap(pte))
589 return NULL;
590
591 print_bad_pte(vma, addr, pte, NULL);
592 return NULL;
593 }

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
597 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
598 if (vma->vm_flags & VM_MIXEDMAP) {
599 if (!pfn_valid(pfn))
600 return NULL;
601 goto out;
602 } else {
603 unsigned long off;
604 off = (addr - vma->vm_start) >> PAGE_SHIFT;
605 if (pfn == vma->vm_pgoff + off)
606 return NULL;
607 if (!is_cow_mapping(vma->vm_flags))
608 return NULL;
609 }
610 }
611
612 if (is_zero_pfn(pfn))
613 return NULL;
614
615check_pfn:
616 if (unlikely(pfn > highest_memmap_pfn)) {
617 print_bad_pte(vma, addr, pte, NULL);
618 return NULL;
619 }

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
625out:
626 return pfn_to_page(pfn);
627}
628
629#ifdef CONFIG_TRANSPARENT_HUGEPAGE
630struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
631 pmd_t pmd)
632{
633 unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
	 */
640 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
641 if (vma->vm_flags & VM_MIXEDMAP) {
642 if (!pfn_valid(pfn))
643 return NULL;
644 goto out;
645 } else {
646 unsigned long off;
647 off = (addr - vma->vm_start) >> PAGE_SHIFT;
648 if (pfn == vma->vm_pgoff + off)
649 return NULL;
650 if (!is_cow_mapping(vma->vm_flags))
651 return NULL;
652 }
653 }
654
655 if (pmd_devmap(pmd))
656 return NULL;
657 if (is_zero_pfn(pfn))
658 return NULL;
659 if (unlikely(pfn > highest_memmap_pfn))
660 return NULL;
661
662
663
664
665
666out:
667 return pfn_to_page(pfn);
668}
669#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
677static inline unsigned long
678copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
679 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
680 unsigned long addr, int *rss)
681{
682 unsigned long vm_flags = vma->vm_flags;
683 pte_t pte = *src_pte;
684 struct page *page;
685
686
687 if (unlikely(!pte_present(pte))) {
688 swp_entry_t entry = pte_to_swp_entry(pte);
689
690 if (likely(!non_swap_entry(entry))) {
691 if (swap_duplicate(entry) < 0)
692 return entry.val;
693
694
695 if (unlikely(list_empty(&dst_mm->mmlist))) {
696 spin_lock(&mmlist_lock);
697 if (list_empty(&dst_mm->mmlist))
698 list_add(&dst_mm->mmlist,
699 &src_mm->mmlist);
700 spin_unlock(&mmlist_lock);
701 }
702 rss[MM_SWAPENTS]++;
703 } else if (is_migration_entry(entry)) {
704 page = migration_entry_to_page(entry);
705
706 rss[mm_counter(page)]++;
707
708 if (is_write_migration_entry(entry) &&
709 is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
714 make_migration_entry_read(&entry);
715 pte = swp_entry_to_pte(entry);
716 if (pte_swp_soft_dirty(*src_pte))
717 pte = pte_swp_mksoft_dirty(pte);
718 set_pte_at(src_mm, addr, src_pte, pte);
719 }
720 } else if (is_device_private_entry(entry)) {
721 page = device_private_entry_to_page(entry);

			/*
			 * Update the rss count even for unaddressable device
			 * private pages: they are accounted just like normal
			 * pages in this respect, and the child takes its own
			 * reference and rmap entry below.
			 */
732 get_page(page);
733 rss[mm_counter(page)]++;
734 page_dup_rmap(page, false);
735
736
737
738
739
740
741
742
743 if (is_write_device_private_entry(entry) &&
744 is_cow_mapping(vm_flags)) {
745 make_device_private_entry_read(&entry);
746 pte = swp_entry_to_pte(entry);
747 set_pte_at(src_mm, addr, src_pte, pte);
748 }
749 }
750 goto out_set_pte;
751 }

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
757 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
758 ptep_set_wrprotect(src_mm, addr, src_pte);
759 pte = pte_wrprotect(pte);
760 }
761
762
763
764
765
766 if (vm_flags & VM_SHARED)
767 pte = pte_mkclean(pte);
768 pte = pte_mkold(pte);
769
770 page = vm_normal_page(vma, addr, pte);
771 if (page) {
772 get_page(page);
773 page_dup_rmap(page, false);
774 rss[mm_counter(page)]++;
775 } else if (pte_devmap(pte)) {
776 page = pte_page(pte);
777 }
778
779out_set_pte:
780 set_pte_at(dst_mm, addr, dst_pte, pte);
781 return 0;
782}
783
784static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
785 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
786 unsigned long addr, unsigned long end)
787{
788 pte_t *orig_src_pte, *orig_dst_pte;
789 pte_t *src_pte, *dst_pte;
790 spinlock_t *src_ptl, *dst_ptl;
791 int progress = 0;
792 int rss[NR_MM_COUNTERS];
793 swp_entry_t entry = (swp_entry_t){0};
794
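	/*
	 * entry.val is set when copy_one_pte() hits a swap entry that needs
	 * a swap-count continuation; the continuation is allocated below,
	 * after the page-table locks are dropped, and the copy is then
	 * restarted at that pte.
	 */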
795again:
796 init_rss_vec(rss);
797
798 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
799 if (!dst_pte)
800 return -ENOMEM;
801 src_pte = pte_offset_map(src_pmd, addr);
802 src_ptl = pte_lockptr(src_mm, src_pmd);
803 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
804 orig_src_pte = src_pte;
805 orig_dst_pte = dst_pte;
806 arch_enter_lazy_mmu_mode();
807
808 do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
813 if (progress >= 32) {
814 progress = 0;
815 if (need_resched() ||
816 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
817 break;
818 }
819 if (pte_none(*src_pte)) {
820 progress++;
821 continue;
822 }
823 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
824 vma, addr, rss);
825 if (entry.val)
826 break;
827 progress += 8;
828 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
829
830 arch_leave_lazy_mmu_mode();
831 spin_unlock(src_ptl);
832 pte_unmap(orig_src_pte);
833 add_mm_rss_vec(dst_mm, rss);
834 pte_unmap_unlock(orig_dst_pte, dst_ptl);
835 cond_resched();
836
837 if (entry.val) {
838 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
839 return -ENOMEM;
840 progress = 0;
841 }
842 if (addr != end)
843 goto again;
844 return 0;
845}
846
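/*
 * Copy one pmd's worth of mappings from src_mm to dst_mm.  Huge, swap and
 * devmap pmds are handed to copy_huge_pmd(); everything else goes through
 * copy_pte_range() one pte at a time.
 */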
847static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
848 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
849 unsigned long addr, unsigned long end)
850{
851 pmd_t *src_pmd, *dst_pmd;
852 unsigned long next;
853
854 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
855 if (!dst_pmd)
856 return -ENOMEM;
857 src_pmd = pmd_offset(src_pud, addr);
858 do {
859 next = pmd_addr_end(addr, end);
860 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
861 || pmd_devmap(*src_pmd)) {
862 int err;
863 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
864 err = copy_huge_pmd(dst_mm, src_mm,
865 dst_pmd, src_pmd, addr, vma);
866 if (err == -ENOMEM)
867 return -ENOMEM;
868 if (!err)
869 continue;
			/* fall through */
871 }
872 if (pmd_none_or_clear_bad(src_pmd))
873 continue;
874 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
875 vma, addr, next))
876 return -ENOMEM;
877 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
878 return 0;
879}
880
881static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
882 p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
883 unsigned long addr, unsigned long end)
884{
885 pud_t *src_pud, *dst_pud;
886 unsigned long next;
887
888 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
889 if (!dst_pud)
890 return -ENOMEM;
891 src_pud = pud_offset(src_p4d, addr);
892 do {
893 next = pud_addr_end(addr, end);
894 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
895 int err;
896
897 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
898 err = copy_huge_pud(dst_mm, src_mm,
899 dst_pud, src_pud, addr, vma);
900 if (err == -ENOMEM)
901 return -ENOMEM;
902 if (!err)
903 continue;
			/* fall through */
905 }
906 if (pud_none_or_clear_bad(src_pud))
907 continue;
908 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
909 vma, addr, next))
910 return -ENOMEM;
911 } while (dst_pud++, src_pud++, addr = next, addr != end);
912 return 0;
913}
914
915static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
916 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
917 unsigned long addr, unsigned long end)
918{
919 p4d_t *src_p4d, *dst_p4d;
920 unsigned long next;
921
922 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
923 if (!dst_p4d)
924 return -ENOMEM;
925 src_p4d = p4d_offset(src_pgd, addr);
926 do {
927 next = p4d_addr_end(addr, end);
928 if (p4d_none_or_clear_bad(src_p4d))
929 continue;
930 if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
931 vma, addr, next))
932 return -ENOMEM;
933 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
934 return 0;
935}
936
937int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
938 struct vm_area_struct *vma)
939{
940 pgd_t *src_pgd, *dst_pgd;
941 unsigned long next;
942 unsigned long addr = vma->vm_start;
943 unsigned long end = vma->vm_end;
944 struct mmu_notifier_range range;
945 bool is_cow;
946 int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
954 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
955 !vma->anon_vma)
956 return 0;
957
958 if (is_vm_hugetlb_page(vma))
959 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
960
961 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
962
963
964
965
966 ret = track_pfn_copy(vma);
967 if (ret)
968 return ret;
969 }

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
977 is_cow = is_cow_mapping(vma->vm_flags);
978
979 if (is_cow) {
980 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
981 0, vma, src_mm, addr, end);
982 mmu_notifier_invalidate_range_start(&range);
983 }
984
985 ret = 0;
986 dst_pgd = pgd_offset(dst_mm, addr);
987 src_pgd = pgd_offset(src_mm, addr);
988 do {
989 next = pgd_addr_end(addr, end);
990 if (pgd_none_or_clear_bad(src_pgd))
991 continue;
992 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
993 vma, addr, next))) {
994 ret = -ENOMEM;
995 break;
996 }
997 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
998
999 if (is_cow)
1000 mmu_notifier_invalidate_range_end(&range);
1001 return ret;
1002}
1003
1004static unsigned long zap_pte_range(struct mmu_gather *tlb,
1005 struct vm_area_struct *vma, pmd_t *pmd,
1006 unsigned long addr, unsigned long end,
1007 struct zap_details *details)
1008{
1009 struct mm_struct *mm = tlb->mm;
1010 int force_flush = 0;
1011 int rss[NR_MM_COUNTERS];
1012 spinlock_t *ptl;
1013 pte_t *start_pte;
1014 pte_t *pte;
1015 swp_entry_t entry;
1016
1017 tlb_change_page_size(tlb, PAGE_SIZE);
1018again:
1019 init_rss_vec(rss);
1020 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1021 pte = start_pte;
1022 flush_tlb_batched_pending(mm);
1023 arch_enter_lazy_mmu_mode();
1024 do {
1025 pte_t ptent = *pte;
1026 if (pte_none(ptent))
1027 continue;
1028
1029 if (pte_present(ptent)) {
1030 struct page *page;
1031
1032 page = vm_normal_page(vma, addr, ptent);
1033 if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
1039 if (details->check_mapping &&
1040 details->check_mapping != page_rmapping(page))
1041 continue;
1042 }
1043 ptent = ptep_get_and_clear_full(mm, addr, pte,
1044 tlb->fullmm);
1045 tlb_remove_tlb_entry(tlb, pte, addr);
1046 if (unlikely(!page))
1047 continue;
1048
1049 if (!PageAnon(page)) {
1050 if (pte_dirty(ptent)) {
1051 force_flush = 1;
1052 set_page_dirty(page);
1053 }
1054 if (pte_young(ptent) &&
1055 likely(!(vma->vm_flags & VM_SEQ_READ)))
1056 mark_page_accessed(page);
1057 }
1058 rss[mm_counter(page)]--;
1059 page_remove_rmap(page, false);
1060 if (unlikely(page_mapcount(page) < 0))
1061 print_bad_pte(vma, addr, ptent, page);
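			/*
			 * __tlb_remove_page() returns true when the gather
			 * batch is full: force a TLB flush, then restart the
			 * scan from the next address.
			 */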
1062 if (unlikely(__tlb_remove_page(tlb, page))) {
1063 force_flush = 1;
1064 addr += PAGE_SIZE;
1065 break;
1066 }
1067 continue;
1068 }
1069
1070 entry = pte_to_swp_entry(ptent);
1071 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1072 struct page *page = device_private_entry_to_page(entry);
1073
1074 if (unlikely(details && details->check_mapping)) {
1075
1076
1077
1078
1079
1080 if (details->check_mapping !=
1081 page_rmapping(page))
1082 continue;
1083 }
1084
1085 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1086 rss[mm_counter(page)]--;
1087 page_remove_rmap(page, false);
1088 put_page(page);
1089 continue;
1090 }
1091
1092
1093 if (unlikely(details))
1094 continue;
1095
1096 entry = pte_to_swp_entry(ptent);
1097 if (!non_swap_entry(entry))
1098 rss[MM_SWAPENTS]--;
1099 else if (is_migration_entry(entry)) {
1100 struct page *page;
1101
1102 page = migration_entry_to_page(entry);
1103 rss[mm_counter(page)]--;
1104 }
1105 if (unlikely(!free_swap_and_cache(entry)))
1106 print_bad_pte(vma, addr, ptent, NULL);
1107 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1108 } while (pte++, addr += PAGE_SIZE, addr != end);
1109
1110 add_mm_rss_vec(mm, rss);
1111 arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
1114 if (force_flush)
1115 tlb_flush_mmu_tlbonly(tlb);
1116 pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
1124 if (force_flush) {
1125 force_flush = 0;
1126 tlb_flush_mmu(tlb);
1127 if (addr != end)
1128 goto again;
1129 }
1130
1131 return addr;
1132}
1133
1134static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1135 struct vm_area_struct *vma, pud_t *pud,
1136 unsigned long addr, unsigned long end,
1137 struct zap_details *details)
1138{
1139 pmd_t *pmd;
1140 unsigned long next;
1141
1142 pmd = pmd_offset(pud, addr);
1143 do {
1144 next = pmd_addr_end(addr, end);
1145 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1146 if (next - addr != HPAGE_PMD_SIZE)
1147 __split_huge_pmd(vma, pmd, addr, false, NULL);
1148 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1149 goto next;
			/* fall through */
1151 }
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
1159 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1160 goto next;
1161 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1162next:
1163 cond_resched();
1164 } while (pmd++, addr = next, addr != end);
1165
1166 return addr;
1167}
1168
1169static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1170 struct vm_area_struct *vma, p4d_t *p4d,
1171 unsigned long addr, unsigned long end,
1172 struct zap_details *details)
1173{
1174 pud_t *pud;
1175 unsigned long next;
1176
1177 pud = pud_offset(p4d, addr);
1178 do {
1179 next = pud_addr_end(addr, end);
1180 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1181 if (next - addr != HPAGE_PUD_SIZE) {
1182 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1183 split_huge_pud(vma, pud, addr);
1184 } else if (zap_huge_pud(tlb, vma, pud, addr))
1185 goto next;
			/* fall through */
1187 }
1188 if (pud_none_or_clear_bad(pud))
1189 continue;
1190 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1191next:
1192 cond_resched();
1193 } while (pud++, addr = next, addr != end);
1194
1195 return addr;
1196}
1197
1198static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1199 struct vm_area_struct *vma, pgd_t *pgd,
1200 unsigned long addr, unsigned long end,
1201 struct zap_details *details)
1202{
1203 p4d_t *p4d;
1204 unsigned long next;
1205
1206 p4d = p4d_offset(pgd, addr);
1207 do {
1208 next = p4d_addr_end(addr, end);
1209 if (p4d_none_or_clear_bad(p4d))
1210 continue;
1211 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1212 } while (p4d++, addr = next, addr != end);
1213
1214 return addr;
1215}
1216
1217void unmap_page_range(struct mmu_gather *tlb,
1218 struct vm_area_struct *vma,
1219 unsigned long addr, unsigned long end,
1220 struct zap_details *details)
1221{
1222 pgd_t *pgd;
1223 unsigned long next;
1224
1225 BUG_ON(addr >= end);
1226 tlb_start_vma(tlb, vma);
1227 pgd = pgd_offset(vma->vm_mm, addr);
1228 do {
1229 next = pgd_addr_end(addr, end);
1230 if (pgd_none_or_clear_bad(pgd))
1231 continue;
1232 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1233 } while (pgd++, addr = next, addr != end);
1234 tlb_end_vma(tlb, vma);
1235}
1236
1237
1238static void unmap_single_vma(struct mmu_gather *tlb,
1239 struct vm_area_struct *vma, unsigned long start_addr,
1240 unsigned long end_addr,
1241 struct zap_details *details)
1242{
1243 unsigned long start = max(vma->vm_start, start_addr);
1244 unsigned long end;
1245
1246 if (start >= vma->vm_end)
1247 return;
1248 end = min(vma->vm_end, end_addr);
1249 if (end <= vma->vm_start)
1250 return;
1251
1252 if (vma->vm_file)
1253 uprobe_munmap(vma, start, end);
1254
1255 if (unlikely(vma->vm_flags & VM_PFNMAP))
1256 untrack_pfn(vma, 0, 0);
1257
1258 if (start != end) {
1259 if (unlikely(is_vm_hugetlb_page(vma))) {
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271 if (vma->vm_file) {
1272 i_mmap_lock_write(vma->vm_file->f_mapping);
1273 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1274 i_mmap_unlock_write(vma->vm_file->f_mapping);
1275 }
1276 } else
1277 unmap_page_range(tlb, vma, start, end, details);
1278 }
1279}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
1299void unmap_vmas(struct mmu_gather *tlb,
1300 struct vm_area_struct *vma, unsigned long start_addr,
1301 unsigned long end_addr)
1302{
1303 struct mmu_notifier_range range;
1304
1305 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1306 start_addr, end_addr);
1307 mmu_notifier_invalidate_range_start(&range);
1308 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1309 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1310 mmu_notifier_invalidate_range_end(&range);
1311}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
1321void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1322 unsigned long size)
1323{
1324 struct mmu_notifier_range range;
1325 struct mmu_gather tlb;
1326
1327 lru_add_drain();
1328 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1329 start, start + size);
1330 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1331 update_hiwater_rss(vma->vm_mm);
1332 mmu_notifier_invalidate_range_start(&range);
1333 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1334 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1335 mmu_notifier_invalidate_range_end(&range);
1336 tlb_finish_mmu(&tlb, start, range.end);
1337}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1348static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1349 unsigned long size, struct zap_details *details)
1350{
1351 struct mmu_notifier_range range;
1352 struct mmu_gather tlb;
1353
1354 lru_add_drain();
1355 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1356 address, address + size);
1357 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1358 update_hiwater_rss(vma->vm_mm);
1359 mmu_notifier_invalidate_range_start(&range);
1360 unmap_single_vma(&tlb, vma, address, range.end, details);
1361 mmu_notifier_invalidate_range_end(&range);
1362 tlb_finish_mmu(&tlb, address, range.end);
1363}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 */
1376void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1377 unsigned long size)
1378{
1379 if (address < vma->vm_start || address + size > vma->vm_end ||
1380 !(vma->vm_flags & VM_PFNMAP))
1381 return;
1382
1383 zap_page_range_single(vma, address, size, NULL);
1384}
1385EXPORT_SYMBOL_GPL(zap_vma_ptes);
1386
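/*
 * Walk (and allocate, if necessary) the page tables down to the pte level
 * for @addr, returning the pte mapped with its lock held in *@ptl, or NULL
 * if a page-table allocation failed.
 */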
1387pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1388 spinlock_t **ptl)
1389{
1390 pgd_t *pgd;
1391 p4d_t *p4d;
1392 pud_t *pud;
1393 pmd_t *pmd;
1394
1395 pgd = pgd_offset(mm, addr);
1396 p4d = p4d_alloc(mm, pgd, addr);
1397 if (!p4d)
1398 return NULL;
1399 pud = pud_alloc(mm, p4d, addr);
1400 if (!pud)
1401 return NULL;
1402 pmd = pmd_alloc(mm, pud, addr);
1403 if (!pmd)
1404 return NULL;
1405
1406 VM_BUG_ON(pmd_trans_huge(*pmd));
1407 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1408}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
1417static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1418 struct page *page, pgprot_t prot)
1419{
1420 struct mm_struct *mm = vma->vm_mm;
1421 int retval;
1422 pte_t *pte;
1423 spinlock_t *ptl;
1424
1425 retval = -EINVAL;
1426 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1427 goto out;
1428 retval = -ENOMEM;
1429 flush_dcache_page(page);
1430 pte = get_locked_pte(mm, addr, &ptl);
1431 if (!pte)
1432 goto out;
1433 retval = -EBUSY;
1434 if (!pte_none(*pte))
1435 goto out_unlock;
1436
1437
1438 get_page(page);
1439 inc_mm_counter_fast(mm, mm_counter_file(page));
1440 page_add_file_rmap(page, false);
1441 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1442
1443 retval = 0;
1444out_unlock:
1445 pte_unmap_unlock(pte, ptl);
1446out:
1447 return retval;
1448}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.  The page must be a refcounted kernel allocation
 * (anonymous, slab and typed pages are rejected).
 *
 * Usually this is called from the f_op->mmap() handler with the
 * mmap_sem held for write, in which case the vma is marked VM_MIXEDMAP
 * here.  Callers that want to use it from elsewhere (for example from a
 * fault handler) must set VM_MIXEDMAP on the vma themselves.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1479int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1480 struct page *page)
1481{
1482 if (addr < vma->vm_start || addr >= vma->vm_end)
1483 return -EFAULT;
1484 if (!page_count(page))
1485 return -EINVAL;
1486 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1487 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1488 BUG_ON(vma->vm_flags & VM_PFNMAP);
1489 vma->vm_flags |= VM_MIXEDMAP;
1490 }
1491 return insert_page(vma, addr, page, vma->vm_page_prot);
1492}
1493EXPORT_SYMBOL(vm_insert_page);
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1507 unsigned long num, unsigned long offset)
1508{
1509 unsigned long count = vma_pages(vma);
1510 unsigned long uaddr = vma->vm_start;
1511 int ret, i;
1512
1513
1514 if (offset >= num)
1515 return -ENXIO;
1516
1517
1518 if (count > num - offset)
1519 return -ENXIO;
1520
1521 for (i = 0; i < count; i++) {
1522 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1523 if (ret < 0)
1524 return ret;
1525 uaddr += PAGE_SIZE;
1526 }
1527
1528 return 0;
1529}

/**
 * vm_map_pages - maps range of kernel pages starting with non zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: %0 on success and error code otherwise.
 */
1549int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1550 unsigned long num)
1551{
1552 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1553}
1554EXPORT_SYMBOL(vm_map_pages);
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
1570 unsigned long num)
1571{
1572 return __vm_map_pages(vma, pages, num, 0);
1573}
1574EXPORT_SYMBOL(vm_map_pages_zero);
1575
1576static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1577 pfn_t pfn, pgprot_t prot, bool mkwrite)
1578{
1579 struct mm_struct *mm = vma->vm_mm;
1580 pte_t *pte, entry;
1581 spinlock_t *ptl;
1582
1583 pte = get_locked_pte(mm, addr, &ptl);
1584 if (!pte)
1585 return VM_FAULT_OOM;
1586 if (!pte_none(*pte)) {
1587 if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
1598 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1599 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1600 goto out_unlock;
1601 }
1602 entry = pte_mkyoung(*pte);
1603 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1604 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1605 update_mmu_cache(vma, addr, pte);
1606 }
1607 goto out_unlock;
1608 }
1609
1610
1611 if (pfn_t_devmap(pfn))
1612 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1613 else
1614 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1615
1616 if (mkwrite) {
1617 entry = pte_mkyoung(entry);
1618 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1619 }
1620
1621 set_pte_at(mm, addr, pte, entry);
1622 update_mmu_cache(vma, addr, pte);
1623
1624out_unlock:
1625 pte_unmap_unlock(pte, ptl);
1626 return VM_FAULT_NOPAGE;
1627}
1628

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.
 *
 * Context: Process context.
 * Return: vm_fault_t value.
 */
1647vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1648 unsigned long pfn, pgprot_t pgprot)
1649{
1650
1651
1652
1653
1654
1655
1656 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1657 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1658 (VM_PFNMAP|VM_MIXEDMAP));
1659 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1660 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1661
1662 if (addr < vma->vm_start || addr >= vma->vm_end)
1663 return VM_FAULT_SIGBUS;
1664
1665 if (!pfn_modify_allowed(pfn, pgprot))
1666 return VM_FAULT_SIGBUS;
1667
1668 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1669
1670 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1671 false);
1672}
1673EXPORT_SYMBOL(vmf_insert_pfn_prot);
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1696 unsigned long pfn)
1697{
1698 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1699}
1700EXPORT_SYMBOL(vmf_insert_pfn);
1701
1702static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1703{
	/* these checks mirror the abort conditions in vm_normal_page */
1705 if (vma->vm_flags & VM_MIXEDMAP)
1706 return true;
1707 if (pfn_t_devmap(pfn))
1708 return true;
1709 if (pfn_t_special(pfn))
1710 return true;
1711 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
1712 return true;
1713 return false;
1714}
1715
1716static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
1717 unsigned long addr, pfn_t pfn, bool mkwrite)
1718{
1719 pgprot_t pgprot = vma->vm_page_prot;
1720 int err;
1721
1722 BUG_ON(!vm_mixed_ok(vma, pfn));
1723
1724 if (addr < vma->vm_start || addr >= vma->vm_end)
1725 return VM_FAULT_SIGBUS;
1726
1727 track_pfn_insert(vma, &pgprot, pfn);
1728
1729 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1730 return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  Device pages that are not refcounted still go
	 * through insert_pfn().
	 */
1739 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
1740 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1741 struct page *page;
1742
1743
1744
1745
1746
1747
1748 page = pfn_to_page(pfn_t_to_pfn(pfn));
1749 err = insert_page(vma, addr, page, pgprot);
1750 } else {
1751 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1752 }
1753
1754 if (err == -ENOMEM)
1755 return VM_FAULT_OOM;
1756 if (err < 0 && err != -EBUSY)
1757 return VM_FAULT_SIGBUS;
1758
1759 return VM_FAULT_NOPAGE;
1760}
1761
1762vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1763 pfn_t pfn)
1764{
1765 return __vm_insert_mixed(vma, addr, pfn, false);
1766}
1767EXPORT_SYMBOL(vmf_insert_mixed);
1768
1769
1770
1771
1772
1773
1774vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
1775 unsigned long addr, pfn_t pfn)
1776{
1777 return __vm_insert_mixed(vma, addr, pfn, true);
1778}
1779EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
1786static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1787 unsigned long addr, unsigned long end,
1788 unsigned long pfn, pgprot_t prot)
1789{
1790 pte_t *pte;
1791 spinlock_t *ptl;
1792 int err = 0;
1793
1794 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1795 if (!pte)
1796 return -ENOMEM;
1797 arch_enter_lazy_mmu_mode();
1798 do {
1799 BUG_ON(!pte_none(*pte));
1800 if (!pfn_modify_allowed(pfn, prot)) {
1801 err = -EACCES;
1802 break;
1803 }
1804 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1805 pfn++;
1806 } while (pte++, addr += PAGE_SIZE, addr != end);
1807 arch_leave_lazy_mmu_mode();
1808 pte_unmap_unlock(pte - 1, ptl);
1809 return err;
1810}
1811
1812static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1813 unsigned long addr, unsigned long end,
1814 unsigned long pfn, pgprot_t prot)
1815{
1816 pmd_t *pmd;
1817 unsigned long next;
1818 int err;
1819
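	/*
	 * Rebase pfn so that pfn + (addr >> PAGE_SHIFT) yields the frame
	 * number to map at each address handed down the walk.
	 */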
1820 pfn -= addr >> PAGE_SHIFT;
1821 pmd = pmd_alloc(mm, pud, addr);
1822 if (!pmd)
1823 return -ENOMEM;
1824 VM_BUG_ON(pmd_trans_huge(*pmd));
1825 do {
1826 next = pmd_addr_end(addr, end);
1827 err = remap_pte_range(mm, pmd, addr, next,
1828 pfn + (addr >> PAGE_SHIFT), prot);
1829 if (err)
1830 return err;
1831 } while (pmd++, addr = next, addr != end);
1832 return 0;
1833}
1834
1835static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
1836 unsigned long addr, unsigned long end,
1837 unsigned long pfn, pgprot_t prot)
1838{
1839 pud_t *pud;
1840 unsigned long next;
1841 int err;
1842
1843 pfn -= addr >> PAGE_SHIFT;
1844 pud = pud_alloc(mm, p4d, addr);
1845 if (!pud)
1846 return -ENOMEM;
1847 do {
1848 next = pud_addr_end(addr, end);
1849 err = remap_pmd_range(mm, pud, addr, next,
1850 pfn + (addr >> PAGE_SHIFT), prot);
1851 if (err)
1852 return err;
1853 } while (pud++, addr = next, addr != end);
1854 return 0;
1855}
1856
1857static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
1858 unsigned long addr, unsigned long end,
1859 unsigned long pfn, pgprot_t prot)
1860{
1861 p4d_t *p4d;
1862 unsigned long next;
1863 int err;
1864
1865 pfn -= addr >> PAGE_SHIFT;
1866 p4d = p4d_alloc(mm, pgd, addr);
1867 if (!p4d)
1868 return -ENOMEM;
1869 do {
1870 next = p4d_addr_end(addr, end);
1871 err = remap_pud_range(mm, p4d, addr, next,
1872 pfn + (addr >> PAGE_SHIFT), prot);
1873 if (err)
1874 return err;
1875 } while (p4d++, addr = next, addr != end);
1876 return 0;
1877}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1891int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1892 unsigned long pfn, unsigned long size, pgprot_t prot)
1893{
1894 pgd_t *pgd;
1895 unsigned long next;
1896 unsigned long end = addr + PAGE_ALIGN(size);
1897 struct mm_struct *mm = vma->vm_mm;
1898 unsigned long remap_pfn = pfn;
1899 int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *	Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *	Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
1919 if (is_cow_mapping(vma->vm_flags)) {
1920 if (addr != vma->vm_start || end != vma->vm_end)
1921 return -EINVAL;
1922 vma->vm_pgoff = pfn;
1923 }
1924
1925 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
1926 if (err)
1927 return -EINVAL;
1928
1929 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1930
1931 BUG_ON(addr >= end);
1932 pfn -= addr >> PAGE_SHIFT;
1933 pgd = pgd_offset(mm, addr);
1934 flush_cache_range(vma, addr, end);
1935 do {
1936 next = pgd_addr_end(addr, end);
1937 err = remap_p4d_range(mm, pgd, addr, next,
1938 pfn + (addr >> PAGE_SHIFT), prot);
1939 if (err)
1940 break;
1941 } while (pgd++, addr = next, addr != end);
1942
1943 if (err)
1944 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
1945
1946 return err;
1947}
1948EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
1965int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1966{
1967 unsigned long vm_len, pfn, pages;
1968
1969
1970 if (start + len < start)
1971 return -EINVAL;
1972
1973
1974
1975
1976
1977 len += start & ~PAGE_MASK;
1978 pfn = start >> PAGE_SHIFT;
1979 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1980 if (pfn + pages < pfn)
1981 return -EINVAL;
1982
1983
1984 if (vma->vm_pgoff > pages)
1985 return -EINVAL;
1986 pfn += vma->vm_pgoff;
1987 pages -= vma->vm_pgoff;
1988
1989
1990 vm_len = vma->vm_end - vma->vm_start;
1991 if (vm_len >> PAGE_SHIFT > pages)
1992 return -EINVAL;
1993
1994
1995 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1996}
1997EXPORT_SYMBOL(vm_iomap_memory);
1998
1999static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2000 unsigned long addr, unsigned long end,
2001 pte_fn_t fn, void *data)
2002{
2003 pte_t *pte;
2004 int err;
2005 spinlock_t *uninitialized_var(ptl);
2006
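	/*
	 * For init_mm the walk runs without a pte lock (pte_alloc_kernel());
	 * user mms map and lock the pte table with pte_alloc_map_lock() and
	 * drop it again at the end.
	 */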
2007 pte = (mm == &init_mm) ?
2008 pte_alloc_kernel(pmd, addr) :
2009 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2010 if (!pte)
2011 return -ENOMEM;
2012
2013 BUG_ON(pmd_huge(*pmd));
2014
2015 arch_enter_lazy_mmu_mode();
2016
2017 do {
2018 err = fn(pte++, addr, data);
2019 if (err)
2020 break;
2021 } while (addr += PAGE_SIZE, addr != end);
2022
2023 arch_leave_lazy_mmu_mode();
2024
2025 if (mm != &init_mm)
2026 pte_unmap_unlock(pte-1, ptl);
2027 return err;
2028}
2029
2030static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2031 unsigned long addr, unsigned long end,
2032 pte_fn_t fn, void *data)
2033{
2034 pmd_t *pmd;
2035 unsigned long next;
2036 int err;
2037
2038 BUG_ON(pud_huge(*pud));
2039
2040 pmd = pmd_alloc(mm, pud, addr);
2041 if (!pmd)
2042 return -ENOMEM;
2043 do {
2044 next = pmd_addr_end(addr, end);
2045 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2046 if (err)
2047 break;
2048 } while (pmd++, addr = next, addr != end);
2049 return err;
2050}
2051
2052static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2053 unsigned long addr, unsigned long end,
2054 pte_fn_t fn, void *data)
2055{
2056 pud_t *pud;
2057 unsigned long next;
2058 int err;
2059
2060 pud = pud_alloc(mm, p4d, addr);
2061 if (!pud)
2062 return -ENOMEM;
2063 do {
2064 next = pud_addr_end(addr, end);
2065 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2066 if (err)
2067 break;
2068 } while (pud++, addr = next, addr != end);
2069 return err;
2070}
2071
2072static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2073 unsigned long addr, unsigned long end,
2074 pte_fn_t fn, void *data)
2075{
2076 p4d_t *p4d;
2077 unsigned long next;
2078 int err;
2079
2080 p4d = p4d_alloc(mm, pgd, addr);
2081 if (!p4d)
2082 return -ENOMEM;
2083 do {
2084 next = p4d_addr_end(addr, end);
2085 err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2086 if (err)
2087 break;
2088 } while (p4d++, addr = next, addr != end);
2089 return err;
2090}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2096int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2097 unsigned long size, pte_fn_t fn, void *data)
2098{
2099 pgd_t *pgd;
2100 unsigned long next;
2101 unsigned long end = addr + size;
2102 int err;
2103
2104 if (WARN_ON(addr >= end))
2105 return -EINVAL;
2106
2107 pgd = pgd_offset(mm, addr);
2108 do {
2109 next = pgd_addr_end(addr, end);
2110 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2111 if (err)
2112 break;
2113 } while (pgd++, addr = next, addr != end);
2114
2115 return err;
2116}
2117EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
2127static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2128 pte_t *page_table, pte_t orig_pte)
2129{
2130 int same = 1;
2131#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2132 if (sizeof(pte_t) > sizeof(unsigned long)) {
2133 spinlock_t *ptl = pte_lockptr(mm, pmd);
2134 spin_lock(ptl);
2135 same = pte_same(*page_table, orig_pte);
2136 spin_unlock(ptl);
2137 }
2138#endif
2139 pte_unmap(page_table);
2140 return same;
2141}
2142
2143static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2144{
2145 debug_dma_assert_idle(src);

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
2153 if (unlikely(!src)) {
2154 void *kaddr = kmap_atomic(dst);
2155 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2156
2157
2158
2159
2160
2161
2162
2163 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2164 clear_page(kaddr);
2165 kunmap_atomic(kaddr);
2166 flush_dcache_page(dst);
2167 } else
2168 copy_user_highpage(dst, src, va, vma);
2169}
2170
2171static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2172{
2173 struct file *vm_file = vma->vm_file;
2174
2175 if (vm_file)
2176 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2177
2178
2179
2180
2181
2182 return GFP_KERNEL;
2183}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
2191static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2192{
2193 vm_fault_t ret;
2194 struct page *page = vmf->page;
2195 unsigned int old_flags = vmf->flags;
2196
2197 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2198
2199 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2200
2201 vmf->flags = old_flags;
2202 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2203 return ret;
2204 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2205 lock_page(page);
2206 if (!page->mapping) {
2207 unlock_page(page);
2208 return 0;
2209 }
2210 ret |= VM_FAULT_LOCKED;
2211 } else
2212 VM_BUG_ON_PAGE(!PageLocked(page), page);
2213 return ret;
2214}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
2221static void fault_dirty_shared_page(struct vm_area_struct *vma,
2222 struct page *page)
2223{
2224 struct address_space *mapping;
2225 bool dirtied;
2226 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2227
2228 dirtied = set_page_dirty(page);
2229 VM_BUG_ON_PAGE(PageAnon(page), page);
2230
2231
2232
2233
2234
2235
2236 mapping = page_rmapping(page);
2237 unlock_page(page);
2238
2239 if ((dirtied || page_mkwrite) && mapping) {
2240
2241
2242
2243
2244 balance_dirty_pages_ratelimited(mapping);
2245 }
2246
2247 if (!page_mkwrite)
2248 file_update_time(vma->vm_file);
2249}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
2259static inline void wp_page_reuse(struct vm_fault *vmf)
2260 __releases(vmf->ptl)
2261{
2262 struct vm_area_struct *vma = vmf->vma;
2263 struct page *page = vmf->page;
2264 pte_t entry;
2265
2266
2267
2268
2269
2270 if (page)
2271 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2272
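	/*
	 * Make the existing pte young, dirty and (where the vma allows)
	 * writable; ptep_set_access_flags() reports whether the entry
	 * changed, so the MMU cache is only updated when needed.
	 */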
2273 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2274 entry = pte_mkyoung(vmf->orig_pte);
2275 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2276 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2277 update_mmu_cache(vma, vmf->address, vmf->pte);
2278 pte_unmap_unlock(vmf->pte, vmf->ptl);
2279}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page.
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
2297static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2298{
2299 struct vm_area_struct *vma = vmf->vma;
2300 struct mm_struct *mm = vma->vm_mm;
2301 struct page *old_page = vmf->page;
2302 struct page *new_page = NULL;
2303 pte_t entry;
2304 int page_copied = 0;
2305 struct mem_cgroup *memcg;
2306 struct mmu_notifier_range range;
2307
2308 if (unlikely(anon_vma_prepare(vma)))
2309 goto oom;
2310
2311 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2312 new_page = alloc_zeroed_user_highpage_movable(vma,
2313 vmf->address);
2314 if (!new_page)
2315 goto oom;
2316 } else {
2317 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2318 vmf->address);
2319 if (!new_page)
2320 goto oom;
2321 cow_user_page(new_page, old_page, vmf->address, vma);
2322 }
2323
2324 if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
2325 goto oom_free_new;
2326
2327 __SetPageUptodate(new_page);
2328
2329 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
2330 vmf->address & PAGE_MASK,
2331 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2332 mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
2337 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2338 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2339 if (old_page) {
2340 if (!PageAnon(old_page)) {
2341 dec_mm_counter_fast(mm,
2342 mm_counter_file(old_page));
2343 inc_mm_counter_fast(mm, MM_ANONPAGES);
2344 }
2345 } else {
2346 inc_mm_counter_fast(mm, MM_ANONPAGES);
2347 }
2348 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2349 entry = mk_pte(new_page, vma->vm_page_prot);
2350 entry = maybe_mkwrite(pte_mkdirty(entry), vma);

		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
2357 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2358 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2359 mem_cgroup_commit_charge(new_page, memcg, false, false);
2360 lru_cache_add_active_or_unevictable(new_page, vma);
2361
2362
2363
2364
2365
2366 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2367 update_mmu_cache(vma, vmf->address, vmf->pte);
2368 if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptep_clear_flush above.
			 * The TLB flush in ptep_clear_flush ensures that no
			 * process can access the old page before the
			 * decremented mapcount is visible, so the old page
			 * cannot be reused until the new mapping is in place.
			 */
2391 page_remove_rmap(old_page, false);
2392 }
2393
2394
2395 new_page = old_page;
2396 page_copied = 1;
2397 } else {
2398 mem_cgroup_cancel_charge(new_page, memcg, false);
2399 }
2400
2401 if (new_page)
2402 put_page(new_page);
2403
2404 pte_unmap_unlock(vmf->pte, vmf->ptl);

	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
2409 mmu_notifier_invalidate_range_only_end(&range);
2410 if (old_page) {
2411
2412
2413
2414
2415 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2416 lock_page(old_page);
2417 if (PageMlocked(old_page))
2418 munlock_vma_page(old_page);
2419 unlock_page(old_page);
2420 }
2421 put_page(old_page);
2422 }
2423 return page_copied ? VM_FAULT_WRITE : 0;
2424oom_free_new:
2425 put_page(new_page);
2426oom:
2427 if (old_page)
2428 put_page(old_page);
2429 return VM_FAULT_OOM;
2430}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of the PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE changed before we
 * acquired the PTE lock.
 */
2448vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2449{
2450 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2451 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2452 &vmf->ptl);
2453
2454
2455
2456
2457 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2458 pte_unmap_unlock(vmf->pte, vmf->ptl);
2459 return VM_FAULT_NOPAGE;
2460 }
2461 wp_page_reuse(vmf);
2462 return 0;
2463}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
2469static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
2470{
2471 struct vm_area_struct *vma = vmf->vma;
2472
2473 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2474 vm_fault_t ret;
2475
2476 pte_unmap_unlock(vmf->pte, vmf->ptl);
2477 vmf->flags |= FAULT_FLAG_MKWRITE;
2478 ret = vma->vm_ops->pfn_mkwrite(vmf);
2479 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2480 return ret;
2481 return finish_mkwrite_fault(vmf);
2482 }
2483 wp_page_reuse(vmf);
2484 return VM_FAULT_WRITE;
2485}
2486
2487static vm_fault_t wp_page_shared(struct vm_fault *vmf)
2488 __releases(vmf->ptl)
2489{
2490 struct vm_area_struct *vma = vmf->vma;
2491
2492 get_page(vmf->page);
2493
2494 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2495 vm_fault_t tmp;
2496
2497 pte_unmap_unlock(vmf->pte, vmf->ptl);
2498 tmp = do_page_mkwrite(vmf);
2499 if (unlikely(!tmp || (tmp &
2500 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2501 put_page(vmf->page);
2502 return tmp;
2503 }
2504 tmp = finish_mkwrite_fault(vmf);
2505 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2506 unlock_page(vmf->page);
2507 put_page(vmf->page);
2508 return tmp;
2509 }
2510 } else {
2511 wp_page_reuse(vmf);
2512 lock_page(vmf->page);
2513 }
2514 fault_dirty_shared_page(vma, vmf->page);
2515 put_page(vmf->page);
2516
2517 return VM_FAULT_WRITE;
2518}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2538static vm_fault_t do_wp_page(struct vm_fault *vmf)
2539 __releases(vmf->ptl)
2540{
2541 struct vm_area_struct *vma = vmf->vma;
2542
2543 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2544 if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
2552 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2553 (VM_WRITE|VM_SHARED))
2554 return wp_pfn_shared(vmf);
2555
2556 pte_unmap_unlock(vmf->pte, vmf->ptl);
2557 return wp_page_copy(vmf);
2558 }

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
2564 if (PageAnon(vmf->page)) {
2565 int total_map_swapcount;
2566 if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
2567 page_count(vmf->page) != 1))
2568 goto copy;
2569 if (!trylock_page(vmf->page)) {
2570 get_page(vmf->page);
2571 pte_unmap_unlock(vmf->pte, vmf->ptl);
2572 lock_page(vmf->page);
2573 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2574 vmf->address, &vmf->ptl);
2575 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2576 unlock_page(vmf->page);
2577 pte_unmap_unlock(vmf->pte, vmf->ptl);
2578 put_page(vmf->page);
2579 return 0;
2580 }
2581 put_page(vmf->page);
2582 }
2583 if (PageKsm(vmf->page)) {
2584 bool reused = reuse_ksm_page(vmf->page, vmf->vma,
2585 vmf->address);
2586 unlock_page(vmf->page);
2587 if (!reused)
2588 goto copy;
2589 wp_page_reuse(vmf);
2590 return VM_FAULT_WRITE;
2591 }
2592 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2593 if (total_map_swapcount == 1) {
			/*
			 * The page is all ours. Move it to
			 * our anon_vma so the rmap code will
			 * not search our parent or siblings.
			 * Protected against the rmap code by
			 * the page lock.
			 */
2601 page_move_anon_rmap(vmf->page, vma);
2602 }
2603 unlock_page(vmf->page);
2604 wp_page_reuse(vmf);
2605 return VM_FAULT_WRITE;
2606 }
2607 unlock_page(vmf->page);
2608 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2609 (VM_WRITE|VM_SHARED))) {
2610 return wp_page_shared(vmf);
2611 }
2612copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 */
2616 get_page(vmf->page);
2617
2618 pte_unmap_unlock(vmf->pte, vmf->ptl);
2619 return wp_page_copy(vmf);
2620}
2621
2622static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2623 unsigned long start_addr, unsigned long end_addr,
2624 struct zap_details *details)
2625{
2626 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2627}
2628
2629static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2630 struct zap_details *details)
2631{
2632 struct vm_area_struct *vma;
2633 pgoff_t vba, vea, zba, zea;
2634
2635 vma_interval_tree_foreach(vma, root,
2636 details->first_index, details->last_index) {
2637
2638 vba = vma->vm_pgoff;
2639 vea = vba + vma_pages(vma) - 1;
2640 zba = details->first_index;
2641 if (zba < vba)
2642 zba = vba;
2643 zea = details->last_index;
2644 if (zea > vea)
2645 zea = vea;
2646
2647 unmap_mapping_range_vma(vma,
2648 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2649 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2650 details);
2651 }
2652}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
2666void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
2667 pgoff_t nr, bool even_cows)
2668{
2669 struct zap_details details = { };
2670
2671 details.check_mapping = even_cows ? NULL : mapping;
2672 details.first_index = start;
2673 details.last_index = start + nr - 1;
2674 if (details.last_index < details.first_index)
2675 details.last_index = ULONG_MAX;
2676
2677 i_mmap_lock_write(mapping);
2678 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2679 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2680 i_mmap_unlock_write(mapping);
2681}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
2700void unmap_mapping_range(struct address_space *mapping,
2701 loff_t const holebegin, loff_t const holelen, int even_cows)
2702{
2703 pgoff_t hba = holebegin >> PAGE_SHIFT;
2704 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2705

	/* Check for overflow. */
2707 if (sizeof(holelen) > sizeof(hlen)) {
2708 long long holeend =
2709 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2710 if (holeend & ~(long long)ULONG_MAX)
2711 hlen = ULONG_MAX - hba + 1;
2712 }
2713
2714 unmap_mapping_pages(mapping, hba, hlen, even_cows);
2715}
2716EXPORT_SYMBOL(unmap_mapping_range);

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
2726vm_fault_t do_swap_page(struct vm_fault *vmf)
2727{
2728 struct vm_area_struct *vma = vmf->vma;
2729 struct page *page = NULL, *swapcache;
2730 struct mem_cgroup *memcg;
2731 swp_entry_t entry;
2732 pte_t pte;
2733 int locked;
2734 int exclusive = 0;
2735 vm_fault_t ret = 0;
2736
2737 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2738 goto out;
2739
2740 entry = pte_to_swp_entry(vmf->orig_pte);
2741 if (unlikely(non_swap_entry(entry))) {
2742 if (is_migration_entry(entry)) {
2743 migration_entry_wait(vma->vm_mm, vmf->pmd,
2744 vmf->address);
2745 } else if (is_device_private_entry(entry)) {
2746 vmf->page = device_private_entry_to_page(entry);
2747 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
2748 } else if (is_hwpoison_entry(entry)) {
2749 ret = VM_FAULT_HWPOISON;
2750 } else {
2751 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2752 ret = VM_FAULT_SIGBUS;
2753 }
2754 goto out;
2755 }
2756
2757
2758 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2759 page = lookup_swap_cache(entry, vma, vmf->address);
2760 swapcache = page;
2761
2762 if (!page) {
2763 struct swap_info_struct *si = swp_swap_info(entry);
2764
2765 if (si->flags & SWP_SYNCHRONOUS_IO &&
2766 __swap_count(entry) == 1) {
			/* skip swapcache */
2768 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2769 vmf->address);
2770 if (page) {
2771 __SetPageLocked(page);
2772 __SetPageSwapBacked(page);
2773 set_page_private(page, entry.val);
2774 lru_cache_add_anon(page);
2775 swap_readpage(page, true);
2776 }
2777 } else {
2778 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2779 vmf);
2780 swapcache = page;
2781 }
2782
2783 if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
2788 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2789 vmf->address, &vmf->ptl);
2790 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2791 ret = VM_FAULT_OOM;
2792 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2793 goto unlock;
2794 }
2795
2796
2797 ret = VM_FAULT_MAJOR;
2798 count_vm_event(PGMAJFAULT);
2799 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2800 } else if (PageHWPoison(page)) {
2801
2802
2803
2804
2805 ret = VM_FAULT_HWPOISON;
2806 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2807 goto out_release;
2808 }
2809
2810 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2811
2812 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2813 if (!locked) {
2814 ret |= VM_FAULT_RETRY;
2815 goto out_release;
2816 }
2817
	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
2824 if (unlikely((!PageSwapCache(page) ||
2825 page_private(page) != entry.val)) && swapcache)
2826 goto out_page;
2827
2828 page = ksm_might_need_to_copy(page, vma, vmf->address);
2829 if (unlikely(!page)) {
2830 ret = VM_FAULT_OOM;
2831 page = swapcache;
2832 goto out_page;
2833 }
2834
2835 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
2836 &memcg, false)) {
2837 ret = VM_FAULT_OOM;
2838 goto out_page;
2839 }
2840
2841
2842
2843
2844 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2845 &vmf->ptl);
2846 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2847 goto out_nomap;
2848
2849 if (unlikely(!PageUptodate(page))) {
2850 ret = VM_FAULT_SIGBUS;
2851 goto out_nomap;
2852 }
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2865 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2866 pte = mk_pte(page, vma->vm_page_prot);
2867 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2868 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2869 vmf->flags &= ~FAULT_FLAG_WRITE;
2870 ret |= VM_FAULT_WRITE;
2871 exclusive = RMAP_EXCLUSIVE;
2872 }
2873 flush_icache_page(vma, page);
2874 if (pte_swp_soft_dirty(vmf->orig_pte))
2875 pte = pte_mksoft_dirty(pte);
2876 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2877 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
2878 vmf->orig_pte = pte;
2879
2880
2881 if (unlikely(page != swapcache && swapcache)) {
2882 page_add_new_anon_rmap(page, vma, vmf->address, false);
2883 mem_cgroup_commit_charge(page, memcg, false, false);
2884 lru_cache_add_active_or_unevictable(page, vma);
2885 } else {
2886 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2887 mem_cgroup_commit_charge(page, memcg, true, false);
2888 activate_page(page);
2889 }
2890
2891 swap_free(entry);
2892 if (mem_cgroup_swap_full(page) ||
2893 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2894 try_to_free_swap(page);
2895 unlock_page(page);
2896 if (page != swapcache && swapcache) {
		/*
		 * Hold the page lock to avoid the swap entry being reused
		 * until we have taken the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same).  For further
		 * safety, release the lock only after the swap_free(), so
		 * that the swap count cannot change under a parallel locked
		 * swapcache.
		 */
2905 unlock_page(swapcache);
2906 put_page(swapcache);
2907 }
2908
2909 if (vmf->flags & FAULT_FLAG_WRITE) {
2910 ret |= do_wp_page(vmf);
2911 if (ret & VM_FAULT_ERROR)
2912 ret &= VM_FAULT_ERROR;
2913 goto out;
2914 }
2915
2916
2917 update_mmu_cache(vma, vmf->address, vmf->pte);
2918unlock:
2919 pte_unmap_unlock(vmf->pte, vmf->ptl);
2920out:
2921 return ret;
2922out_nomap:
2923 mem_cgroup_cancel_charge(page, memcg, false);
2924 pte_unmap_unlock(vmf->pte, vmf->ptl);
2925out_page:
2926 unlock_page(page);
2927out_release:
2928 put_page(page);
2929 if (page != swapcache && swapcache) {
2930 unlock_page(swapcache);
2931 put_page(swapcache);
2932 }
2933 return ret;
2934}
2935
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2941static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
2942{
2943 struct vm_area_struct *vma = vmf->vma;
2944 struct mem_cgroup *memcg;
2945 struct page *page;
2946 vm_fault_t ret = 0;
2947 pte_t entry;
2948
2949
2950 if (vma->vm_flags & VM_SHARED)
2951 return VM_FAULT_SIGBUS;
2952
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have down_read(mmap_sem).
	 */
2963 if (pte_alloc(vma->vm_mm, vmf->pmd))
2964 return VM_FAULT_OOM;
2965
2966
2967 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2968 return 0;
2969
2970
2971 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2972 !mm_forbids_zeropage(vma->vm_mm)) {
2973 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2974 vma->vm_page_prot));
2975 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2976 vmf->address, &vmf->ptl);
2977 if (!pte_none(*vmf->pte))
2978 goto unlock;
2979 ret = check_stable_address_space(vma->vm_mm);
2980 if (ret)
2981 goto unlock;
2982
2983 if (userfaultfd_missing(vma)) {
2984 pte_unmap_unlock(vmf->pte, vmf->ptl);
2985 return handle_userfault(vmf, VM_UFFD_MISSING);
2986 }
2987 goto setpte;
2988 }
2989
2990
2991 if (unlikely(anon_vma_prepare(vma)))
2992 goto oom;
2993 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
2994 if (!page)
2995 goto oom;
2996
2997 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
2998 false))
2999 goto oom_free_page;
3000
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
3006 __SetPageUptodate(page);
3007
3008 entry = mk_pte(page, vma->vm_page_prot);
3009 if (vma->vm_flags & VM_WRITE)
3010 entry = pte_mkwrite(pte_mkdirty(entry));
3011
3012 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3013 &vmf->ptl);
3014 if (!pte_none(*vmf->pte))
3015 goto release;
3016
3017 ret = check_stable_address_space(vma->vm_mm);
3018 if (ret)
3019 goto release;
3020
3021
3022 if (userfaultfd_missing(vma)) {
3023 pte_unmap_unlock(vmf->pte, vmf->ptl);
3024 mem_cgroup_cancel_charge(page, memcg, false);
3025 put_page(page);
3026 return handle_userfault(vmf, VM_UFFD_MISSING);
3027 }
3028
3029 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3030 page_add_new_anon_rmap(page, vma, vmf->address, false);
3031 mem_cgroup_commit_charge(page, memcg, false, false);
3032 lru_cache_add_active_or_unevictable(page, vma);
3033setpte:
3034 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3035
3036
3037 update_mmu_cache(vma, vmf->address, vmf->pte);
3038unlock:
3039 pte_unmap_unlock(vmf->pte, vmf->ptl);
3040 return ret;
3041release:
3042 mem_cgroup_cancel_charge(page, memcg, false);
3043 put_page(page);
3044 goto unlock;
3045oom_free_page:
3046 put_page(page);
3047oom:
3048 return VM_FAULT_OOM;
3049}
3050
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3056static vm_fault_t __do_fault(struct vm_fault *vmf)
3057{
3058 struct vm_area_struct *vma = vmf->vma;
3059 vm_fault_t ret;
3060
	/*
	 * Preallocate the pte before taking the page lock: allocating it
	 * afterwards could deadlock against memcg reclaim, which may wait
	 * for writeback on a page whose lock this fault would then be
	 * holding.
	 */
3076 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3077 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3078 if (!vmf->prealloc_pte)
3079 return VM_FAULT_OOM;
3080 smp_wmb();
3081 }
3082
3083 ret = vma->vm_ops->fault(vmf);
3084 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3085 VM_FAULT_DONE_COW)))
3086 return ret;
3087
3088 if (unlikely(PageHWPoison(vmf->page))) {
3089 if (ret & VM_FAULT_LOCKED)
3090 unlock_page(vmf->page);
3091 put_page(vmf->page);
3092 vmf->page = NULL;
3093 return VM_FAULT_HWPOISON;
3094 }
3095
3096 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3097 lock_page(vmf->page);
3098 else
3099 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3100
3101 return ret;
3102}
3103
3104
/*
 * Returns true if the pmd is a devmap huge entry or is unstable (it may be
 * turning into or out of a huge pmd under us); in either case it is not
 * safe to walk the pte level below it.  pmd_devmap() is checked first so a
 * populated devmap pmd is not misreported by pmd_trans_unstable().
 */
3110static int pmd_devmap_trans_unstable(pmd_t *pmd)
3111{
3112 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3113}
3114
3115static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3116{
3117 struct vm_area_struct *vma = vmf->vma;
3118
3119 if (!pmd_none(*vmf->pmd))
3120 goto map_pte;
3121 if (vmf->prealloc_pte) {
3122 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3123 if (unlikely(!pmd_none(*vmf->pmd))) {
3124 spin_unlock(vmf->ptl);
3125 goto map_pte;
3126 }
3127
3128 mm_inc_nr_ptes(vma->vm_mm);
3129 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3130 spin_unlock(vmf->ptl);
3131 vmf->prealloc_pte = NULL;
3132 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3133 return VM_FAULT_OOM;
3134 }
3135map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
	 * running immediately after a huge pmd fault in a different thread
	 * of this mm, in turn leading to a misleading pmd_trans_huge()
	 * return value.  All we have to ensure is that it is a regular pmd
	 * that we can walk with pte_offset_map(), and we can do that through
	 * an atomic read in C, which is what pmd_trans_unstable() provides.
	 */
3147 if (pmd_devmap_trans_unstable(vmf->pmd))
3148 return VM_FAULT_NOPAGE;
3149
3150
	/*
	 * At this point we know that our vmf->pmd points to a page of ptes
	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
	 * for the duration of the fault.  If a racing MADV_DONTNEED zaps
	 * the ptes under us, vmf->ptl stays valid and the !pte_none()
	 * re-check in alloc_set_pte() under that lock will catch it.
	 */
3159 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3160 &vmf->ptl);
3161 return 0;
3162}
3163
3164#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3165static void deposit_prealloc_pte(struct vm_fault *vmf)
3166{
3167 struct vm_area_struct *vma = vmf->vma;
3168
3169 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3170
3171
3172
3173
3174 mm_inc_nr_ptes(vma->vm_mm);
3175 vmf->prealloc_pte = NULL;
3176}
3177
3178static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3179{
3180 struct vm_area_struct *vma = vmf->vma;
3181 bool write = vmf->flags & FAULT_FLAG_WRITE;
3182 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3183 pmd_t entry;
3184 int i;
3185 vm_fault_t ret;
3186
3187 if (!transhuge_vma_suitable(vma, haddr))
3188 return VM_FAULT_FALLBACK;
3189
3190 ret = VM_FAULT_FALLBACK;
3191 page = compound_head(page);
3192
3193
3194
3195
3196
3197 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3198 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3199 if (!vmf->prealloc_pte)
3200 return VM_FAULT_OOM;
3201 smp_wmb();
3202 }
3203
3204 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3205 if (unlikely(!pmd_none(*vmf->pmd)))
3206 goto out;
3207
3208 for (i = 0; i < HPAGE_PMD_NR; i++)
3209 flush_icache_page(vma, page + i);
3210
3211 entry = mk_huge_pmd(page, vma->vm_page_prot);
3212 if (write)
3213 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3214
3215 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3216 page_add_file_rmap(page, true);
3217
3218
3219
3220 if (arch_needs_pgtable_deposit())
3221 deposit_prealloc_pte(vmf);
3222
3223 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3224
3225 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3226
3227
3228 ret = 0;
3229 count_vm_event(THP_FILE_MAPPED);
3230out:
3231 spin_unlock(vmf->ptl);
3232 return ret;
3233}
3234#else
3235static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3236{
3237 BUILD_BUG();
3238 return 0;
3239}
3240#endif
3241
/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates page table or use pre-allocated.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vma_ops->fault.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3258vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3259 struct page *page)
3260{
3261 struct vm_area_struct *vma = vmf->vma;
3262 bool write = vmf->flags & FAULT_FLAG_WRITE;
3263 pte_t entry;
3264 vm_fault_t ret;
3265
3266 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3267 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3268
3269 VM_BUG_ON_PAGE(memcg, page);
3270
3271 ret = do_set_pmd(vmf, page);
3272 if (ret != VM_FAULT_FALLBACK)
3273 return ret;
3274 }
3275
3276 if (!vmf->pte) {
3277 ret = pte_alloc_one_map(vmf);
3278 if (ret)
3279 return ret;
3280 }
3281
3282
3283 if (unlikely(!pte_none(*vmf->pte)))
3284 return VM_FAULT_NOPAGE;
3285
3286 flush_icache_page(vma, page);
3287 entry = mk_pte(page, vma->vm_page_prot);
3288 if (write)
3289 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3290
3291 if (write && !(vma->vm_flags & VM_SHARED)) {
3292 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3293 page_add_new_anon_rmap(page, vma, vmf->address, false);
3294 mem_cgroup_commit_charge(page, memcg, false, false);
3295 lru_cache_add_active_or_unevictable(page, vma);
3296 } else {
3297 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3298 page_add_file_rmap(page, false);
3299 }
3300 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3301
3302
3303 update_mmu_cache(vma, vmf->address, vmf->pte);
3304
3305 return 0;
3306}
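
/*
 * Typical users: finish_fault() below maps the single page that ->fault()
 * produced, while fault-around implementations such as filemap_map_pages()
 * call alloc_set_pte() directly for each extra page they can map cheaply,
 * passing a NULL memcg since those are file pages.
 */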
3307
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, performs
 * required charging and adds page into page table.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
3324vm_fault_t finish_fault(struct vm_fault *vmf)
3325{
3326 struct page *page;
3327 vm_fault_t ret = 0;
3328
3329
3330 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3331 !(vmf->vma->vm_flags & VM_SHARED))
3332 page = vmf->cow_page;
3333 else
3334 page = vmf->page;
3335
3336
3337
3338
3339
3340 if (!(vmf->vma->vm_flags & VM_SHARED))
3341 ret = check_stable_address_space(vmf->vma->vm_mm);
3342 if (!ret)
3343 ret = alloc_set_pte(vmf, vmf->memcg, page);
3344 if (vmf->pte)
3345 pte_unmap_unlock(vmf->pte, vmf->ptl);
3346 return ret;
3347}
3348
3349static unsigned long fault_around_bytes __read_mostly =
3350 rounddown_pow_of_two(65536);
3351
3352#ifdef CONFIG_DEBUG_FS
3353static int fault_around_bytes_get(void *data, u64 *val)
3354{
3355 *val = fault_around_bytes;
3356 return 0;
3357}
3358
3359
3360
3361
3362
3363static int fault_around_bytes_set(void *data, u64 val)
3364{
3365 if (val / PAGE_SIZE > PTRS_PER_PTE)
3366 return -EINVAL;
3367 if (val > PAGE_SIZE)
3368 fault_around_bytes = rounddown_pow_of_two(val);
3369 else
3370 fault_around_bytes = PAGE_SIZE;
3371 return 0;
3372}
3373DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3374 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3375
3376static int __init fault_around_debugfs(void)
3377{
3378 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3379 &fault_around_bytes_fops);
3380 return 0;
3381}
3382late_initcall(fault_around_debugfs);
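
/*
 * With debugfs mounted (typically at /sys/kernel/debug), the fault-around
 * window can be inspected and tuned at run time, for example:
 *
 *	cat /sys/kernel/debug/fault_around_bytes
 *	echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * Values are rounded down to a power of two by fault_around_bytes_set(),
 * with a minimum of PAGE_SIZE and a maximum of PTRS_PER_PTE * PAGE_SIZE.
 */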
3383#endif
3384
/*
 * do_fault_around() tries to map a few pages around the fault address.  The
 * hope is that those pages will be needed soon and this will lower the
 * number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips a page if it is
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA or page table boundaries, in order to
 * call map_pages() and acquire a PTE lock only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or
 * equal to PTRS_PER_PTE * PAGE_SIZE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size (and therefore
 * to page order).  This way it's easier to guarantee that we don't cross
 * page table boundaries.
 */
3409static vm_fault_t do_fault_around(struct vm_fault *vmf)
3410{
3411 unsigned long address = vmf->address, nr_pages, mask;
3412 pgoff_t start_pgoff = vmf->pgoff;
3413 pgoff_t end_pgoff;
3414 int off;
3415 vm_fault_t ret = 0;
3416
3417 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3418 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3419
3420 vmf->address = max(address & mask, vmf->vma->vm_start);
3421 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3422 start_pgoff -= off;
3423
3424
3425
3426
3427
3428 end_pgoff = start_pgoff -
3429 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3430 PTRS_PER_PTE - 1;
3431 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3432 start_pgoff + nr_pages - 1);
3433
3434 if (pmd_none(*vmf->pmd)) {
3435 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3436 if (!vmf->prealloc_pte)
3437 goto out;
3438 smp_wmb();
3439 }
3440
3441 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3442
3443
3444 if (pmd_trans_huge(*vmf->pmd)) {
3445 ret = VM_FAULT_NOPAGE;
3446 goto out;
3447 }
3448
3449
3450 if (!vmf->pte)
3451 goto out;
3452
3453
3454 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3455 if (!pte_none(*vmf->pte))
3456 ret = VM_FAULT_NOPAGE;
3457 pte_unmap_unlock(vmf->pte, vmf->ptl);
3458out:
3459 vmf->address = address;
3460 vmf->pte = NULL;
3461 return ret;
3462}
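
/*
 * Worked example (assuming 4KB pages, PTRS_PER_PTE == 512 and the default
 * fault_around_bytes of 65536): a read fault at 0x7f0000123456 gives
 * nr_pages = 16 and vmf->address = 0x7f0000120000 (the fault address rounded
 * down to a 64KB boundary), so off = 3 and the window becomes the 16 pages
 * at offsets [pgoff - 3, pgoff + 12], further clamped so it never leaves the
 * VMA or the page table page that contains the faulting pte.
 */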
3463
3464static vm_fault_t do_read_fault(struct vm_fault *vmf)
3465{
3466 struct vm_area_struct *vma = vmf->vma;
3467 vm_fault_t ret = 0;
3468
3469
3470
3471
3472
3473
3474 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3475 ret = do_fault_around(vmf);
3476 if (ret)
3477 return ret;
3478 }
3479
3480 ret = __do_fault(vmf);
3481 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3482 return ret;
3483
3484 ret |= finish_fault(vmf);
3485 unlock_page(vmf->page);
3486 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3487 put_page(vmf->page);
3488 return ret;
3489}
3490
3491static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3492{
3493 struct vm_area_struct *vma = vmf->vma;
3494 vm_fault_t ret;
3495
3496 if (unlikely(anon_vma_prepare(vma)))
3497 return VM_FAULT_OOM;
3498
3499 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3500 if (!vmf->cow_page)
3501 return VM_FAULT_OOM;
3502
3503 if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3504 &vmf->memcg, false)) {
3505 put_page(vmf->cow_page);
3506 return VM_FAULT_OOM;
3507 }
3508
3509 ret = __do_fault(vmf);
3510 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3511 goto uncharge_out;
3512 if (ret & VM_FAULT_DONE_COW)
3513 return ret;
3514
3515 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3516 __SetPageUptodate(vmf->cow_page);
3517
3518 ret |= finish_fault(vmf);
3519 unlock_page(vmf->page);
3520 put_page(vmf->page);
3521 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3522 goto uncharge_out;
3523 return ret;
3524uncharge_out:
3525 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3526 put_page(vmf->cow_page);
3527 return ret;
3528}
3529
3530static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3531{
3532 struct vm_area_struct *vma = vmf->vma;
3533 vm_fault_t ret, tmp;
3534
3535 ret = __do_fault(vmf);
3536 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3537 return ret;
3538
3539
3540
3541
3542
3543 if (vma->vm_ops->page_mkwrite) {
3544 unlock_page(vmf->page);
3545 tmp = do_page_mkwrite(vmf);
3546 if (unlikely(!tmp ||
3547 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3548 put_page(vmf->page);
3549 return tmp;
3550 }
3551 }
3552
3553 ret |= finish_fault(vmf);
3554 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3555 VM_FAULT_RETRY))) {
3556 unlock_page(vmf->page);
3557 put_page(vmf->page);
3558 return ret;
3559 }
3560
3561 fault_dirty_shared_page(vma, vmf->page);
3562 return ret;
3563}
3564
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_sem is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
3573static vm_fault_t do_fault(struct vm_fault *vmf)
3574{
3575 struct vm_area_struct *vma = vmf->vma;
3576 struct mm_struct *vm_mm = vma->vm_mm;
3577 vm_fault_t ret;
3578
3579
3580
3581
3582 if (!vma->vm_ops->fault) {
		/*
		 * If we find a migration pmd entry or a none pmd entry, which
		 * should never happen, return SIGBUS.
		 */
3587 if (unlikely(!pmd_present(*vmf->pmd)))
3588 ret = VM_FAULT_SIGBUS;
3589 else {
3590 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
3591 vmf->pmd,
3592 vmf->address,
3593 &vmf->ptl);
			/*
			 * Make sure this is not a temporary clearing of the
			 * pte by holding the ptl and checking again.  A
			 * read-modify-write update of the pte clears it
			 * first (to avoid concurrent modification by
			 * hardware) and then writes the new value.
			 */
3601 if (unlikely(pte_none(*vmf->pte)))
3602 ret = VM_FAULT_SIGBUS;
3603 else
3604 ret = VM_FAULT_NOPAGE;
3605
3606 pte_unmap_unlock(vmf->pte, vmf->ptl);
3607 }
3608 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
3609 ret = do_read_fault(vmf);
3610 else if (!(vma->vm_flags & VM_SHARED))
3611 ret = do_cow_fault(vmf);
3612 else
3613 ret = do_shared_fault(vmf);
3614
3615
3616 if (vmf->prealloc_pte) {
3617 pte_free(vm_mm, vmf->prealloc_pte);
3618 vmf->prealloc_pte = NULL;
3619 }
3620 return ret;
3621}
3622
3623static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3624 unsigned long addr, int page_nid,
3625 int *flags)
3626{
3627 get_page(page);
3628
3629 count_vm_numa_event(NUMA_HINT_FAULTS);
3630 if (page_nid == numa_node_id()) {
3631 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3632 *flags |= TNF_FAULT_LOCAL;
3633 }
3634
3635 return mpol_misplaced(page, vma, addr);
3636}
3637
3638static vm_fault_t do_numa_page(struct vm_fault *vmf)
3639{
3640 struct vm_area_struct *vma = vmf->vma;
3641 struct page *page = NULL;
3642 int page_nid = NUMA_NO_NODE;
3643 int last_cpupid;
3644 int target_nid;
3645 bool migrated = false;
3646 pte_t pte, old_pte;
3647 bool was_writable = pte_savedwrite(vmf->orig_pte);
3648 int flags = 0;
3649
	/*
	 * The "pte" at this point cannot be used safely without
	 * revalidation under the ptl: it is of NUMA (prot_none) type,
	 * and the pfn may be stale if the read was not atomic.
	 */
3655 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3656 spin_lock(vmf->ptl);
3657 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3658 pte_unmap_unlock(vmf->pte, vmf->ptl);
3659 goto out;
3660 }
3661
3662
3663
3664
3665
3666 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
3667 pte = pte_modify(old_pte, vma->vm_page_prot);
3668 pte = pte_mkyoung(pte);
3669 if (was_writable)
3670 pte = pte_mkwrite(pte);
3671 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
3672 update_mmu_cache(vma, vmf->address, vmf->pte);
3673
3674 page = vm_normal_page(vma, vmf->address, pte);
3675 if (!page) {
3676 pte_unmap_unlock(vmf->pte, vmf->ptl);
3677 return 0;
3678 }
3679
3680
3681 if (PageCompound(page)) {
3682 pte_unmap_unlock(vmf->pte, vmf->ptl);
3683 return 0;
3684 }
3685
	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state.  This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
3694 if (!pte_write(pte))
3695 flags |= TNF_NO_GROUP;
3696
3697
3698
3699
3700
3701 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3702 flags |= TNF_SHARED;
3703
3704 last_cpupid = page_cpupid_last(page);
3705 page_nid = page_to_nid(page);
3706 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3707 &flags);
3708 pte_unmap_unlock(vmf->pte, vmf->ptl);
3709 if (target_nid == NUMA_NO_NODE) {
3710 put_page(page);
3711 goto out;
3712 }
3713
3714
3715 migrated = migrate_misplaced_page(page, vma, target_nid);
3716 if (migrated) {
3717 page_nid = target_nid;
3718 flags |= TNF_MIGRATED;
3719 } else
3720 flags |= TNF_MIGRATE_FAIL;
3721
3722out:
3723 if (page_nid != NUMA_NO_NODE)
3724 task_numa_fault(last_cpupid, page_nid, 1, flags);
3725 return 0;
3726}
3727
3728static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
3729{
3730 if (vma_is_anonymous(vmf->vma))
3731 return do_huge_pmd_anonymous_page(vmf);
3732 if (vmf->vma->vm_ops->huge_fault)
3733 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3734 return VM_FAULT_FALLBACK;
3735}
3736
3737
3738static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3739{
3740 if (vma_is_anonymous(vmf->vma))
3741 return do_huge_pmd_wp_page(vmf, orig_pmd);
3742 if (vmf->vma->vm_ops->huge_fault)
3743 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3744
3745
3746 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3747 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3748
3749 return VM_FAULT_FALLBACK;
3750}
3751
3752static inline bool vma_is_accessible(struct vm_area_struct *vma)
3753{
3754 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3755}
3756
3757static vm_fault_t create_huge_pud(struct vm_fault *vmf)
3758{
3759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3760
3761 if (vma_is_anonymous(vmf->vma))
3762 return VM_FAULT_FALLBACK;
3763 if (vmf->vma->vm_ops->huge_fault)
3764 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3765#endif
3766 return VM_FAULT_FALLBACK;
3767}
3768
3769static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3770{
3771#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3772
3773 if (vma_is_anonymous(vmf->vma))
3774 return VM_FAULT_FALLBACK;
3775 if (vmf->vma->vm_ops->huge_fault)
3776 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3777#endif
3778 return VM_FAULT_FALLBACK;
3779}
3780
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
3796static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
3797{
3798 pte_t entry;
3799
3800 if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate a huge page, and if we expose the page
		 * table for an instant, it will be difficult to retract it
		 * from concurrent faults and from rmap lookups.
		 */
3807 vmf->pte = NULL;
3808 } else {
		/* See comment in pte_alloc_one_map() */
3810 if (pmd_devmap_trans_unstable(vmf->pmd))
3811 return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_sem read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
3818 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3819 vmf->orig_pte = *vmf->pte;
3820
		/*
		 * Some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
3829 barrier();
3830 if (pte_none(vmf->orig_pte)) {
3831 pte_unmap(vmf->pte);
3832 vmf->pte = NULL;
3833 }
3834 }
3835
3836 if (!vmf->pte) {
3837 if (vma_is_anonymous(vmf->vma))
3838 return do_anonymous_page(vmf);
3839 else
3840 return do_fault(vmf);
3841 }
3842
3843 if (!pte_present(vmf->orig_pte))
3844 return do_swap_page(vmf);
3845
3846 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3847 return do_numa_page(vmf);
3848
3849 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3850 spin_lock(vmf->ptl);
3851 entry = vmf->orig_pte;
3852 if (unlikely(!pte_same(*vmf->pte, entry)))
3853 goto unlock;
3854 if (vmf->flags & FAULT_FLAG_WRITE) {
3855 if (!pte_write(entry))
3856 return do_wp_page(vmf);
3857 entry = pte_mkdirty(entry);
3858 }
3859 entry = pte_mkyoung(entry);
3860 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3861 vmf->flags & FAULT_FLAG_WRITE)) {
3862 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3863 } else {
3864
3865
3866
3867
3868
3869
3870 if (vmf->flags & FAULT_FLAG_WRITE)
3871 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3872 }
3873unlock:
3874 pte_unmap_unlock(vmf->pte, vmf->ptl);
3875 return 0;
3876}
3877
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3884static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3885 unsigned long address, unsigned int flags)
3886{
3887 struct vm_fault vmf = {
3888 .vma = vma,
3889 .address = address & PAGE_MASK,
3890 .flags = flags,
3891 .pgoff = linear_page_index(vma, address),
3892 .gfp_mask = __get_fault_gfp_mask(vma),
3893 };
3894 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3895 struct mm_struct *mm = vma->vm_mm;
3896 pgd_t *pgd;
3897 p4d_t *p4d;
3898 vm_fault_t ret;
3899
3900 pgd = pgd_offset(mm, address);
3901 p4d = p4d_alloc(mm, pgd, address);
3902 if (!p4d)
3903 return VM_FAULT_OOM;
3904
3905 vmf.pud = pud_alloc(mm, p4d, address);
3906 if (!vmf.pud)
3907 return VM_FAULT_OOM;
3908 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3909 ret = create_huge_pud(&vmf);
3910 if (!(ret & VM_FAULT_FALLBACK))
3911 return ret;
3912 } else {
3913 pud_t orig_pud = *vmf.pud;
3914
3915 barrier();
3916 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3917
3918
3919
3920 if (dirty && !pud_write(orig_pud)) {
3921 ret = wp_huge_pud(&vmf, orig_pud);
3922 if (!(ret & VM_FAULT_FALLBACK))
3923 return ret;
3924 } else {
3925 huge_pud_set_accessed(&vmf, orig_pud);
3926 return 0;
3927 }
3928 }
3929 }
3930
3931 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3932 if (!vmf.pmd)
3933 return VM_FAULT_OOM;
3934 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3935 ret = create_huge_pmd(&vmf);
3936 if (!(ret & VM_FAULT_FALLBACK))
3937 return ret;
3938 } else {
3939 pmd_t orig_pmd = *vmf.pmd;
3940
3941 barrier();
3942 if (unlikely(is_swap_pmd(orig_pmd))) {
3943 VM_BUG_ON(thp_migration_supported() &&
3944 !is_pmd_migration_entry(orig_pmd));
3945 if (is_pmd_migration_entry(orig_pmd))
3946 pmd_migration_entry_wait(mm, vmf.pmd);
3947 return 0;
3948 }
3949 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3950 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3951 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3952
3953 if (dirty && !pmd_write(orig_pmd)) {
3954 ret = wp_huge_pmd(&vmf, orig_pmd);
3955 if (!(ret & VM_FAULT_FALLBACK))
3956 return ret;
3957 } else {
3958 huge_pmd_set_accessed(&vmf, orig_pmd);
3959 return 0;
3960 }
3961 }
3962 }
3963
3964 return handle_pte_fault(&vmf);
3965}
3966
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3973vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3974 unsigned int flags)
3975{
3976 vm_fault_t ret;
3977
3978 __set_current_state(TASK_RUNNING);
3979
3980 count_vm_event(PGFAULT);
3981 count_memcg_event_mm(vma->vm_mm, PGFAULT);
3982
3983
3984 check_sync_rss_stat(current);
3985
3986 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3987 flags & FAULT_FLAG_INSTRUCTION,
3988 flags & FAULT_FLAG_REMOTE))
3989 return VM_FAULT_SIGSEGV;
3990
	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
3995 if (flags & FAULT_FLAG_USER)
3996 mem_cgroup_enter_user_fault();
3997
3998 if (unlikely(is_vm_hugetlb_page(vma)))
3999 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4000 else
4001 ret = __handle_mm_fault(vma, address, flags);
4002
4003 if (flags & FAULT_FLAG_USER) {
4004 mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
4011 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4012 mem_cgroup_oom_synchronize(false);
4013 }
4014
4015 return ret;
4016}
4017EXPORT_SYMBOL_GPL(handle_mm_fault);
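
/*
 * Sketch of a typical caller (an architecture's page fault handler), shown
 * here only for illustration; the real code lives in arch/<arch>/mm/fault.c:
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	if (vma && vma->vm_start <= address)
 *		fault = handle_mm_fault(vma, address, flags);
 *	up_read(&mm->mmap_sem);
 *
 * where flags carries bits such as FAULT_FLAG_USER, FAULT_FLAG_WRITE and
 * FAULT_FLAG_ALLOW_RETRY describing the faulting access.
 */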
4018
4019#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
4024int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4025{
4026 p4d_t *new = p4d_alloc_one(mm, address);
4027 if (!new)
4028 return -ENOMEM;
4029
4030 smp_wmb();
4031
4032 spin_lock(&mm->page_table_lock);
4033 if (pgd_present(*pgd))
4034 p4d_free(mm, new);
4035 else
4036 pgd_populate(mm, pgd, new);
4037 spin_unlock(&mm->page_table_lock);
4038 return 0;
4039}
4040#endif
4041
4042#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
4047int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4048{
4049 pud_t *new = pud_alloc_one(mm, address);
4050 if (!new)
4051 return -ENOMEM;
4052
4053 smp_wmb();
4054
4055 spin_lock(&mm->page_table_lock);
4056#ifndef __ARCH_HAS_5LEVEL_HACK
4057 if (!p4d_present(*p4d)) {
4058 mm_inc_nr_puds(mm);
4059 p4d_populate(mm, p4d, new);
4060 } else
4061 pud_free(mm, new);
4062#else
4063 if (!pgd_present(*p4d)) {
4064 mm_inc_nr_puds(mm);
4065 pgd_populate(mm, p4d, new);
4066 } else
4067 pud_free(mm, new);
4068#endif
4069 spin_unlock(&mm->page_table_lock);
4070 return 0;
4071}
4072#endif
4073
4074#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
4079int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4080{
4081 spinlock_t *ptl;
4082 pmd_t *new = pmd_alloc_one(mm, address);
4083 if (!new)
4084 return -ENOMEM;
4085
4086 smp_wmb();
4087
4088 ptl = pud_lock(mm, pud);
4089#ifndef __ARCH_HAS_4LEVEL_HACK
4090 if (!pud_present(*pud)) {
4091 mm_inc_nr_pmds(mm);
4092 pud_populate(mm, pud, new);
4093 } else
4094 pmd_free(mm, new);
4095#else
4096 if (!pgd_present(*pud)) {
4097 mm_inc_nr_pmds(mm);
4098 pgd_populate(mm, pud, new);
4099 } else
4100 pmd_free(mm, new);
4101#endif
4102 spin_unlock(ptl);
4103 return 0;
4104}
4105#endif
4106
4107static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4108 struct mmu_notifier_range *range,
4109 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4110{
4111 pgd_t *pgd;
4112 p4d_t *p4d;
4113 pud_t *pud;
4114 pmd_t *pmd;
4115 pte_t *ptep;
4116
4117 pgd = pgd_offset(mm, address);
4118 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4119 goto out;
4120
4121 p4d = p4d_offset(pgd, address);
4122 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4123 goto out;
4124
4125 pud = pud_offset(p4d, address);
4126 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4127 goto out;
4128
4129 pmd = pmd_offset(pud, address);
4130 VM_BUG_ON(pmd_trans_huge(*pmd));
4131
4132 if (pmd_huge(*pmd)) {
4133 if (!pmdpp)
4134 goto out;
4135
4136 if (range) {
4137 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4138 NULL, mm, address & PMD_MASK,
4139 (address & PMD_MASK) + PMD_SIZE);
4140 mmu_notifier_invalidate_range_start(range);
4141 }
4142 *ptlp = pmd_lock(mm, pmd);
4143 if (pmd_huge(*pmd)) {
4144 *pmdpp = pmd;
4145 return 0;
4146 }
4147 spin_unlock(*ptlp);
4148 if (range)
4149 mmu_notifier_invalidate_range_end(range);
4150 }
4151
4152 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4153 goto out;
4154
4155 if (range) {
4156 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4157 address & PAGE_MASK,
4158 (address & PAGE_MASK) + PAGE_SIZE);
4159 mmu_notifier_invalidate_range_start(range);
4160 }
4161 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4162 if (!pte_present(*ptep))
4163 goto unlock;
4164 *ptepp = ptep;
4165 return 0;
4166unlock:
4167 pte_unmap_unlock(ptep, *ptlp);
4168 if (range)
4169 mmu_notifier_invalidate_range_end(range);
4170out:
4171 return -EINVAL;
4172}
4173
4174static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4175 pte_t **ptepp, spinlock_t **ptlp)
4176{
4177 int res;
4178
4179
4180 (void) __cond_lock(*ptlp,
4181 !(res = __follow_pte_pmd(mm, address, NULL,
4182 ptepp, NULL, ptlp)));
4183 return res;
4184}
4185
4186int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4187 struct mmu_notifier_range *range,
4188 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4189{
4190 int res;
4191
4192
4193 (void) __cond_lock(*ptlp,
4194 !(res = __follow_pte_pmd(mm, address, range,
4195 ptepp, pmdpp, ptlp)));
4196 return res;
4197}
4198EXPORT_SYMBOL(follow_pte_pmd);
4199
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
4210int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4211 unsigned long *pfn)
4212{
4213 int ret = -EINVAL;
4214 spinlock_t *ptl;
4215 pte_t *ptep;
4216
4217 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4218 return ret;
4219
4220 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4221 if (ret)
4222 return ret;
4223 *pfn = pte_pfn(*ptep);
4224 pte_unmap_unlock(ptep, ptl);
4225 return 0;
4226}
4227EXPORT_SYMBOL(follow_pfn);
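
/*
 * Illustrative use (a sketch, not taken from a particular driver): a driver
 * that has been handed a userspace address inside a VM_PFNMAP mapping can
 * resolve it to a page frame number like this:
 *
 *	down_read(&current->mm->mmap_sem);
 *	vma = find_vma(current->mm, uaddr);
 *	if (vma)
 *		err = follow_pfn(vma, uaddr, &pfn);
 *	up_read(&current->mm->mmap_sem);
 *
 * The result is only meaningful while the mapping is known not to change,
 * which the caller has to guarantee by other means.
 */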
4228
4229#ifdef CONFIG_HAVE_IOREMAP_PROT
4230int follow_phys(struct vm_area_struct *vma,
4231 unsigned long address, unsigned int flags,
4232 unsigned long *prot, resource_size_t *phys)
4233{
4234 int ret = -EINVAL;
4235 pte_t *ptep, pte;
4236 spinlock_t *ptl;
4237
4238 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4239 goto out;
4240
4241 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4242 goto out;
4243 pte = *ptep;
4244
4245 if ((flags & FOLL_WRITE) && !pte_write(pte))
4246 goto unlock;
4247
4248 *prot = pgprot_val(pte_pgprot(pte));
4249 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4250
4251 ret = 0;
4252unlock:
4253 pte_unmap_unlock(ptep, ptl);
4254out:
4255 return ret;
4256}
4257
4258int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4259 void *buf, int len, int write)
4260{
4261 resource_size_t phys_addr;
4262 unsigned long prot = 0;
4263 void __iomem *maddr;
4264 int offset = addr & (PAGE_SIZE-1);
4265
4266 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4267 return -EINVAL;
4268
4269 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4270 if (!maddr)
4271 return -ENOMEM;
4272
4273 if (write)
4274 memcpy_toio(maddr + offset, buf, len);
4275 else
4276 memcpy_fromio(buf, maddr + offset, len);
4277 iounmap(maddr);
4278
4279 return len;
4280}
4281EXPORT_SYMBOL_GPL(generic_access_phys);
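
/*
 * Illustrative wiring (a sketch): a driver that maps raw physical memory can
 * point its vm_operations_struct ->access hook here so that ptrace and
 * /proc/<pid>/mem can peek at the mapping, much as drivers/char/mem.c does
 * for /dev/mem:
 *
 *	static const struct vm_operations_struct mmap_mem_ops = {
 *		.access = generic_access_phys,
 *	};
 */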
4282#endif
4283
/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
4288int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4289 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4290{
4291 struct vm_area_struct *vma;
4292 void *old_buf = buf;
4293 int write = gup_flags & FOLL_WRITE;
4294
4295 if (down_read_killable(&mm->mmap_sem))
4296 return 0;
4297
4298
4299 while (len) {
4300 int bytes, ret, offset;
4301 void *maddr;
4302 struct page *page = NULL;
4303
4304 ret = get_user_pages_remote(tsk, mm, addr, 1,
4305 gup_flags, &page, &vma, NULL);
4306 if (ret <= 0) {
4307#ifndef CONFIG_HAVE_IOREMAP_PROT
4308 break;
4309#else
4310
4311
4312
4313
4314 vma = find_vma(mm, addr);
4315 if (!vma || vma->vm_start > addr)
4316 break;
4317 if (vma->vm_ops && vma->vm_ops->access)
4318 ret = vma->vm_ops->access(vma, addr, buf,
4319 len, write);
4320 if (ret <= 0)
4321 break;
4322 bytes = ret;
4323#endif
4324 } else {
4325 bytes = len;
4326 offset = addr & (PAGE_SIZE-1);
4327 if (bytes > PAGE_SIZE-offset)
4328 bytes = PAGE_SIZE-offset;
4329
4330 maddr = kmap(page);
4331 if (write) {
4332 copy_to_user_page(vma, page, addr,
4333 maddr + offset, buf, bytes);
4334 set_page_dirty_lock(page);
4335 } else {
4336 copy_from_user_page(vma, page, addr,
4337 buf, maddr + offset, bytes);
4338 }
4339 kunmap(page);
4340 put_page(page);
4341 }
4342 len -= bytes;
4343 buf += bytes;
4344 addr += bytes;
4345 }
4346 up_read(&mm->mmap_sem);
4347
4348 return buf - old_buf;
4349}
4350
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
4363int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4364 void *buf, int len, unsigned int gup_flags)
4365{
4366 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4367}
4368
/*
 * Access another process' address space.
 * The source/target buffer must be in kernel space.
 * Do not walk the page table directly; use get_user_pages().
 */
4374int access_process_vm(struct task_struct *tsk, unsigned long addr,
4375 void *buf, int len, unsigned int gup_flags)
4376{
4377 struct mm_struct *mm;
4378 int ret;
4379
4380 mm = get_task_mm(tsk);
4381 if (!mm)
4382 return 0;
4383
4384 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4385
4386 mmput(mm);
4387
4388 return ret;
4389}
4390EXPORT_SYMBOL_GPL(access_process_vm);
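
/*
 * Illustrative example: this is the primitive behind ptrace peek/poke.  A
 * PTRACE_PEEKDATA-style read of one word is essentially:
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word),
 *			      FOLL_FORCE) != sizeof(word))
 *		return -EIO;
 *
 * (compare generic_ptrace_peekdata() in kernel/ptrace.c).
 */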
4391
/*
 * Print the name of a VMA.
 */
4395void print_vma_addr(char *prefix, unsigned long ip)
4396{
4397 struct mm_struct *mm = current->mm;
4398 struct vm_area_struct *vma;
4399
4400
4401
4402
4403 if (!down_read_trylock(&mm->mmap_sem))
4404 return;
4405
4406 vma = find_vma(mm, ip);
4407 if (vma && vma->vm_file) {
4408 struct file *f = vma->vm_file;
4409 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4410 if (buf) {
4411 char *p;
4412
4413 p = file_path(f, buf, PAGE_SIZE);
4414 if (IS_ERR(p))
4415 p = "?";
4416 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4417 vma->vm_start,
4418 vma->vm_end - vma->vm_start);
4419 free_page((unsigned long)buf);
4420 }
4421 }
4422 up_read(&mm->mmap_sem);
4423}
4424
4425#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4426void __might_fault(const char *file, int line)
4427{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem; this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * annotations below would generate false positives.
	 */
4434 if (uaccess_kernel())
4435 return;
4436 if (pagefault_disabled())
4437 return;
4438 __might_sleep(file, line, 0);
4439#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4440 if (current->mm)
4441		might_lock_read(&current->mm->mmap_sem);
4442#endif
4443}
4444EXPORT_SYMBOL(__might_fault);
4445#endif
4446
4447#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
4453static inline void process_huge_page(
4454 unsigned long addr_hint, unsigned int pages_per_huge_page,
4455 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4456 void *arg)
4457{
4458 int i, n, base, l;
4459 unsigned long addr = addr_hint &
4460 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4461
4462
4463 might_sleep();
4464 n = (addr_hint - addr) / PAGE_SIZE;
4465 if (2 * n <= pages_per_huge_page) {
4466
4467 base = 0;
4468 l = n;
4469
4470 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4471 cond_resched();
4472 process_subpage(addr + i * PAGE_SIZE, i, arg);
4473 }
4474 } else {
4475
4476 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4477 l = pages_per_huge_page - n;
4478
4479 for (i = 0; i < base; i++) {
4480 cond_resched();
4481 process_subpage(addr + i * PAGE_SIZE, i, arg);
4482 }
4483 }
4484
4485
4486
4487
4488 for (i = 0; i < l; i++) {
4489 int left_idx = base + i;
4490 int right_idx = base + 2 * l - 1 - i;
4491
4492 cond_resched();
4493 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4494 cond_resched();
4495 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4496 }
4497}
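
/*
 * Small worked example of the ordering above (illustrative only): with
 * pages_per_huge_page == 8 and the target at index 2, the first loop handles
 * subpages 7, 6, 5, 4, and the left-right loop then handles 0, 3, 1, 2, so
 * the target subpage is touched last and its cache lines stay hot.
 */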
4498
4499static void clear_gigantic_page(struct page *page,
4500 unsigned long addr,
4501 unsigned int pages_per_huge_page)
4502{
4503 int i;
4504 struct page *p = page;
4505
4506 might_sleep();
4507 for (i = 0; i < pages_per_huge_page;
4508 i++, p = mem_map_next(p, page, i)) {
4509 cond_resched();
4510 clear_user_highpage(p, addr + i * PAGE_SIZE);
4511 }
4512}
4513
4514static void clear_subpage(unsigned long addr, int idx, void *arg)
4515{
4516 struct page *page = arg;
4517
4518 clear_user_highpage(page + idx, addr);
4519}
4520
4521void clear_huge_page(struct page *page,
4522 unsigned long addr_hint, unsigned int pages_per_huge_page)
4523{
4524 unsigned long addr = addr_hint &
4525 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4526
4527 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4528 clear_gigantic_page(page, addr, pages_per_huge_page);
4529 return;
4530 }
4531
4532 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4533}
4534
4535static void copy_user_gigantic_page(struct page *dst, struct page *src,
4536 unsigned long addr,
4537 struct vm_area_struct *vma,
4538 unsigned int pages_per_huge_page)
4539{
4540 int i;
4541 struct page *dst_base = dst;
4542 struct page *src_base = src;
4543
4544 for (i = 0; i < pages_per_huge_page; ) {
4545 cond_resched();
4546 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4547
4548 i++;
4549 dst = mem_map_next(dst, dst_base, i);
4550 src = mem_map_next(src, src_base, i);
4551 }
4552}
4553
4554struct copy_subpage_arg {
4555 struct page *dst;
4556 struct page *src;
4557 struct vm_area_struct *vma;
4558};
4559
4560static void copy_subpage(unsigned long addr, int idx, void *arg)
4561{
4562 struct copy_subpage_arg *copy_arg = arg;
4563
4564 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4565 addr, copy_arg->vma);
4566}
4567
4568void copy_user_huge_page(struct page *dst, struct page *src,
4569 unsigned long addr_hint, struct vm_area_struct *vma,
4570 unsigned int pages_per_huge_page)
4571{
4572 unsigned long addr = addr_hint &
4573 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4574 struct copy_subpage_arg arg = {
4575 .dst = dst,
4576 .src = src,
4577 .vma = vma,
4578 };
4579
4580 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4581 copy_user_gigantic_page(dst, src, addr, vma,
4582 pages_per_huge_page);
4583 return;
4584 }
4585
4586 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4587}
4588
4589long copy_huge_page_from_user(struct page *dst_page,
4590 const void __user *usr_src,
4591 unsigned int pages_per_huge_page,
4592 bool allow_pagefault)
4593{
4594 void *src = (void *)usr_src;
4595 void *page_kaddr;
4596 unsigned long i, rc = 0;
4597 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4598
4599 for (i = 0; i < pages_per_huge_page; i++) {
4600 if (allow_pagefault)
4601 page_kaddr = kmap(dst_page + i);
4602 else
4603 page_kaddr = kmap_atomic(dst_page + i);
4604 rc = copy_from_user(page_kaddr,
4605 (const void __user *)(src + i * PAGE_SIZE),
4606 PAGE_SIZE);
4607 if (allow_pagefault)
4608 kunmap(dst_page + i);
4609 else
4610 kunmap_atomic(page_kaddr);
4611
4612 ret_val -= (PAGE_SIZE - rc);
4613 if (rc)
4614 break;
4615
4616 cond_resched();
4617 }
4618 return ret_val;
4619}
4620#endif
4621
4622#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4623
4624static struct kmem_cache *page_ptl_cachep;
4625
4626void __init ptlock_cache_init(void)
4627{
4628 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4629 SLAB_PANIC, NULL);
4630}
4631
4632bool ptlock_alloc(struct page *page)
4633{
4634 spinlock_t *ptl;
4635
4636 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4637 if (!ptl)
4638 return false;
4639 page->ptl = ptl;
4640 return true;
4641}
4642
4643void ptlock_free(struct page *page)
4644{
4645 kmem_cache_free(page_ptl_cachep, page->ptl);
4646}
4647#endif
4648