1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/sched/mm.h>
44#include <linux/sched/coredump.h>
45#include <linux/sched/numa_balancing.h>
46#include <linux/sched/task.h>
47#include <linux/hugetlb.h>
48#include <linux/mman.h>
49#include <linux/swap.h>
50#include <linux/highmem.h>
51#include <linux/pagemap.h>
52#include <linux/memremap.h>
53#include <linux/ksm.h>
54#include <linux/rmap.h>
55#include <linux/export.h>
56#include <linux/delayacct.h>
57#include <linux/init.h>
58#include <linux/pfn_t.h>
59#include <linux/writeback.h>
60#include <linux/memcontrol.h>
61#include <linux/mmu_notifier.h>
62#include <linux/swapops.h>
63#include <linux/elf.h>
64#include <linux/gfp.h>
65#include <linux/migrate.h>
66#include <linux/string.h>
67#include <linux/dma-debug.h>
68#include <linux/debugfs.h>
69#include <linux/userfaultfd_k.h>
70#include <linux/dax.h>
71#include <linux/oom.h>
72
73#include <asm/io.h>
74#include <asm/mmu_context.h>
75#include <asm/pgalloc.h>
76#include <linux/uaccess.h>
77#include <asm/tlb.h>
78#include <asm/tlbflush.h>
79#include <asm/pgtable.h>
80
81#include "internal.h"
82
83#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
84#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
85#endif
86
87#ifndef CONFIG_NEED_MULTIPLE_NODES
88
89unsigned long max_mapnr;
90EXPORT_SYMBOL(max_mapnr);
91
92struct page *mem_map;
93EXPORT_SYMBOL(mem_map);
94#endif
95
96
97
98
99
100
101
102
103void *high_memory;
104EXPORT_SYMBOL(high_memory);
105
106
107
108
109
110
111
112int randomize_va_space __read_mostly =
113#ifdef CONFIG_COMPAT_BRK
114 1;
115#else
116 2;
117#endif
118
119static int __init disable_randmaps(char *s)
120{
121 randomize_va_space = 0;
122 return 1;
123}
124__setup("norandmaps", disable_randmaps);
125
126unsigned long zero_pfn __read_mostly;
127EXPORT_SYMBOL(zero_pfn);
128
129unsigned long highest_memmap_pfn __read_mostly;
130
131
132
133
134static int __init init_zero_pfn(void)
135{
136 zero_pfn = page_to_pfn(ZERO_PAGE(0));
137 return 0;
138}
139core_initcall(init_zero_pfn);
140
141
142#if defined(SPLIT_RSS_COUNTING)
143
144void sync_mm_rss(struct mm_struct *mm)
145{
146 int i;
147
148 for (i = 0; i < NR_MM_COUNTERS; i++) {
149 if (current->rss_stat.count[i]) {
150 add_mm_counter(mm, i, current->rss_stat.count[i]);
151 current->rss_stat.count[i] = 0;
152 }
153 }
154 current->rss_stat.events = 0;
155}
156
157static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
158{
159 struct task_struct *task = current;
160
161 if (likely(task->mm == mm))
162 task->rss_stat.count[member] += val;
163 else
164 add_mm_counter(mm, member, val);
165}
166#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
167#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
168
169
170#define TASK_RSS_EVENTS_THRESH (64)
171static void check_sync_rss_stat(struct task_struct *task)
172{
173 if (unlikely(task != current))
174 return;
175 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
176 sync_mm_rss(task->mm);
177}
178#else
179
180#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
181#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
182
183static void check_sync_rss_stat(struct task_struct *task)
184{
185}
186
187#endif
188
189
190
191
192
193static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
194 unsigned long addr)
195{
196 pgtable_t token = pmd_pgtable(*pmd);
197 pmd_clear(pmd);
198 pte_free_tlb(tlb, token, addr);
199 mm_dec_nr_ptes(tlb->mm);
200}
201
202static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
203 unsigned long addr, unsigned long end,
204 unsigned long floor, unsigned long ceiling)
205{
206 pmd_t *pmd;
207 unsigned long next;
208 unsigned long start;
209
210 start = addr;
211 pmd = pmd_offset(pud, addr);
212 do {
213 next = pmd_addr_end(addr, end);
214 if (pmd_none_or_clear_bad(pmd))
215 continue;
216 free_pte_range(tlb, pmd, addr);
217 } while (pmd++, addr = next, addr != end);
218
219 start &= PUD_MASK;
220 if (start < floor)
221 return;
222 if (ceiling) {
223 ceiling &= PUD_MASK;
224 if (!ceiling)
225 return;
226 }
227 if (end - 1 > ceiling - 1)
228 return;
229
230 pmd = pmd_offset(pud, start);
231 pud_clear(pud);
232 pmd_free_tlb(tlb, pmd, start);
233 mm_dec_nr_pmds(tlb->mm);
234}
235
236static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
237 unsigned long addr, unsigned long end,
238 unsigned long floor, unsigned long ceiling)
239{
240 pud_t *pud;
241 unsigned long next;
242 unsigned long start;
243
244 start = addr;
245 pud = pud_offset(p4d, addr);
246 do {
247 next = pud_addr_end(addr, end);
248 if (pud_none_or_clear_bad(pud))
249 continue;
250 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
251 } while (pud++, addr = next, addr != end);
252
253 start &= P4D_MASK;
254 if (start < floor)
255 return;
256 if (ceiling) {
257 ceiling &= P4D_MASK;
258 if (!ceiling)
259 return;
260 }
261 if (end - 1 > ceiling - 1)
262 return;
263
264 pud = pud_offset(p4d, start);
265 p4d_clear(p4d);
266 pud_free_tlb(tlb, pud, start);
267 mm_dec_nr_puds(tlb->mm);
268}
269
270static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
271 unsigned long addr, unsigned long end,
272 unsigned long floor, unsigned long ceiling)
273{
274 p4d_t *p4d;
275 unsigned long next;
276 unsigned long start;
277
278 start = addr;
279 p4d = p4d_offset(pgd, addr);
280 do {
281 next = p4d_addr_end(addr, end);
282 if (p4d_none_or_clear_bad(p4d))
283 continue;
284 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
285 } while (p4d++, addr = next, addr != end);
286
287 start &= PGDIR_MASK;
288 if (start < floor)
289 return;
290 if (ceiling) {
291 ceiling &= PGDIR_MASK;
292 if (!ceiling)
293 return;
294 }
295 if (end - 1 > ceiling - 1)
296 return;
297
298 p4d = p4d_offset(pgd, start);
299 pgd_clear(pgd);
300 p4d_free_tlb(tlb, p4d, start);
301}
302
303
304
305
306void free_pgd_range(struct mmu_gather *tlb,
307 unsigned long addr, unsigned long end,
308 unsigned long floor, unsigned long ceiling)
309{
310 pgd_t *pgd;
311 unsigned long next;
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339 addr &= PMD_MASK;
340 if (addr < floor) {
341 addr += PMD_SIZE;
342 if (!addr)
343 return;
344 }
345 if (ceiling) {
346 ceiling &= PMD_MASK;
347 if (!ceiling)
348 return;
349 }
350 if (end - 1 > ceiling - 1)
351 end -= PMD_SIZE;
352 if (addr > end - 1)
353 return;
354
355
356
357
358 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
359 pgd = pgd_offset(tlb->mm, addr);
360 do {
361 next = pgd_addr_end(addr, end);
362 if (pgd_none_or_clear_bad(pgd))
363 continue;
364 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
365 } while (pgd++, addr = next, addr != end);
366}
367
368void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
369 unsigned long floor, unsigned long ceiling)
370{
371 while (vma) {
372 struct vm_area_struct *next = vma->vm_next;
373 unsigned long addr = vma->vm_start;
374
375
376
377
378
379 unlink_anon_vmas(vma);
380 unlink_file_vma(vma);
381
382 if (is_vm_hugetlb_page(vma)) {
383 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
384 floor, next ? next->vm_start : ceiling);
385 } else {
386
387
388
389 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
390 && !is_vm_hugetlb_page(next)) {
391 vma = next;
392 next = vma->vm_next;
393 unlink_anon_vmas(vma);
394 unlink_file_vma(vma);
395 }
396 free_pgd_range(tlb, addr, vma->vm_end,
397 floor, next ? next->vm_start : ceiling);
398 }
399 vma = next;
400 }
401}
402
403int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
404{
405 spinlock_t *ptl;
406 pgtable_t new = pte_alloc_one(mm);
407 if (!new)
408 return -ENOMEM;
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423 smp_wmb();
424
425 ptl = pmd_lock(mm, pmd);
426 if (likely(pmd_none(*pmd))) {
427 mm_inc_nr_ptes(mm);
428 pmd_populate(mm, pmd, new);
429 new = NULL;
430 }
431 spin_unlock(ptl);
432 if (new)
433 pte_free(mm, new);
434 return 0;
435}
436
437int __pte_alloc_kernel(pmd_t *pmd)
438{
439 pte_t *new = pte_alloc_one_kernel(&init_mm);
440 if (!new)
441 return -ENOMEM;
442
443 smp_wmb();
444
445 spin_lock(&init_mm.page_table_lock);
446 if (likely(pmd_none(*pmd))) {
447 pmd_populate_kernel(&init_mm, pmd, new);
448 new = NULL;
449 }
450 spin_unlock(&init_mm.page_table_lock);
451 if (new)
452 pte_free_kernel(&init_mm, new);
453 return 0;
454}
455
456static inline void init_rss_vec(int *rss)
457{
458 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
459}
460
461static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
462{
463 int i;
464
465 if (current->mm == mm)
466 sync_mm_rss(mm);
467 for (i = 0; i < NR_MM_COUNTERS; i++)
468 if (rss[i])
469 add_mm_counter(mm, i, rss[i]);
470}
471
472
473
474
475
476
477
478
479static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
480 pte_t pte, struct page *page)
481{
482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
483 p4d_t *p4d = p4d_offset(pgd, addr);
484 pud_t *pud = pud_offset(p4d, addr);
485 pmd_t *pmd = pmd_offset(pud, addr);
486 struct address_space *mapping;
487 pgoff_t index;
488 static unsigned long resume;
489 static unsigned long nr_shown;
490 static unsigned long nr_unshown;
491
492
493
494
495
496 if (nr_shown == 60) {
497 if (time_before(jiffies, resume)) {
498 nr_unshown++;
499 return;
500 }
501 if (nr_unshown) {
502 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
503 nr_unshown);
504 nr_unshown = 0;
505 }
506 nr_shown = 0;
507 }
508 if (nr_shown++ == 0)
509 resume = jiffies + 60 * HZ;
510
511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
512 index = linear_page_index(vma, addr);
513
514 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
515 current->comm,
516 (long long)pte_val(pte), (long long)pmd_val(*pmd));
517 if (page)
518 dump_page(page, "bad pte");
519 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
520 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
521 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
522 vma->vm_file,
523 vma->vm_ops ? vma->vm_ops->fault : NULL,
524 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
525 mapping ? mapping->a_ops->readpage : NULL);
526 dump_stack();
527 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
528}
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
573 pte_t pte, bool with_public_device)
574{
575 unsigned long pfn = pte_pfn(pte);
576
577 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
578 if (likely(!pte_special(pte)))
579 goto check_pfn;
580 if (vma->vm_ops && vma->vm_ops->find_special_page)
581 return vma->vm_ops->find_special_page(vma, addr);
582 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
583 return NULL;
584 if (is_zero_pfn(pfn))
585 return NULL;
586
587
588
589
590
591
592
593
594
595
596
597
598
599 if (likely(pfn <= highest_memmap_pfn)) {
600 struct page *page = pfn_to_page(pfn);
601
602 if (is_device_public_page(page)) {
603 if (with_public_device)
604 return page;
605 return NULL;
606 }
607 }
608
609 if (pte_devmap(pte))
610 return NULL;
611
612 print_bad_pte(vma, addr, pte, NULL);
613 return NULL;
614 }
615
616
617
618 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
619 if (vma->vm_flags & VM_MIXEDMAP) {
620 if (!pfn_valid(pfn))
621 return NULL;
622 goto out;
623 } else {
624 unsigned long off;
625 off = (addr - vma->vm_start) >> PAGE_SHIFT;
626 if (pfn == vma->vm_pgoff + off)
627 return NULL;
628 if (!is_cow_mapping(vma->vm_flags))
629 return NULL;
630 }
631 }
632
633 if (is_zero_pfn(pfn))
634 return NULL;
635
636check_pfn:
637 if (unlikely(pfn > highest_memmap_pfn)) {
638 print_bad_pte(vma, addr, pte, NULL);
639 return NULL;
640 }
641
642
643
644
645
646out:
647 return pfn_to_page(pfn);
648}
649
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE
651struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
652 pmd_t pmd)
653{
654 unsigned long pfn = pmd_pfn(pmd);
655
656
657
658
659
660
661 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
662 if (vma->vm_flags & VM_MIXEDMAP) {
663 if (!pfn_valid(pfn))
664 return NULL;
665 goto out;
666 } else {
667 unsigned long off;
668 off = (addr - vma->vm_start) >> PAGE_SHIFT;
669 if (pfn == vma->vm_pgoff + off)
670 return NULL;
671 if (!is_cow_mapping(vma->vm_flags))
672 return NULL;
673 }
674 }
675
676 if (pmd_devmap(pmd))
677 return NULL;
678 if (is_zero_pfn(pfn))
679 return NULL;
680 if (unlikely(pfn > highest_memmap_pfn))
681 return NULL;
682
683
684
685
686
687out:
688 return pfn_to_page(pfn);
689}
690#endif
691
692
693
694
695
696
697
698static inline unsigned long
699copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
700 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
701 unsigned long addr, int *rss)
702{
703 unsigned long vm_flags = vma->vm_flags;
704 pte_t pte = *src_pte;
705 struct page *page;
706
707
708 if (unlikely(!pte_present(pte))) {
709 swp_entry_t entry = pte_to_swp_entry(pte);
710
711 if (likely(!non_swap_entry(entry))) {
712 if (swap_duplicate(entry) < 0)
713 return entry.val;
714
715
716 if (unlikely(list_empty(&dst_mm->mmlist))) {
717 spin_lock(&mmlist_lock);
718 if (list_empty(&dst_mm->mmlist))
719 list_add(&dst_mm->mmlist,
720 &src_mm->mmlist);
721 spin_unlock(&mmlist_lock);
722 }
723 rss[MM_SWAPENTS]++;
724 } else if (is_migration_entry(entry)) {
725 page = migration_entry_to_page(entry);
726
727 rss[mm_counter(page)]++;
728
729 if (is_write_migration_entry(entry) &&
730 is_cow_mapping(vm_flags)) {
731
732
733
734
735 make_migration_entry_read(&entry);
736 pte = swp_entry_to_pte(entry);
737 if (pte_swp_soft_dirty(*src_pte))
738 pte = pte_swp_mksoft_dirty(pte);
739 set_pte_at(src_mm, addr, src_pte, pte);
740 }
741 } else if (is_device_private_entry(entry)) {
742 page = device_private_entry_to_page(entry);
743
744
745
746
747
748
749
750
751
752
753 get_page(page);
754 rss[mm_counter(page)]++;
755 page_dup_rmap(page, false);
756
757
758
759
760
761
762
763
764 if (is_write_device_private_entry(entry) &&
765 is_cow_mapping(vm_flags)) {
766 make_device_private_entry_read(&entry);
767 pte = swp_entry_to_pte(entry);
768 set_pte_at(src_mm, addr, src_pte, pte);
769 }
770 }
771 goto out_set_pte;
772 }
773
774
775
776
777
778 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
779 ptep_set_wrprotect(src_mm, addr, src_pte);
780 pte = pte_wrprotect(pte);
781 }
782
783
784
785
786
787 if (vm_flags & VM_SHARED)
788 pte = pte_mkclean(pte);
789 pte = pte_mkold(pte);
790
791 page = vm_normal_page(vma, addr, pte);
792 if (page) {
793 get_page(page);
794 page_dup_rmap(page, false);
795 rss[mm_counter(page)]++;
796 } else if (pte_devmap(pte)) {
797 page = pte_page(pte);
798
799
800
801
802
803
804 if (is_device_public_page(page)) {
805 get_page(page);
806 page_dup_rmap(page, false);
807 rss[mm_counter(page)]++;
808 }
809 }
810
811out_set_pte:
812 set_pte_at(dst_mm, addr, dst_pte, pte);
813 return 0;
814}
815
816static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
817 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
818 unsigned long addr, unsigned long end)
819{
820 pte_t *orig_src_pte, *orig_dst_pte;
821 pte_t *src_pte, *dst_pte;
822 spinlock_t *src_ptl, *dst_ptl;
823 int progress = 0;
824 int rss[NR_MM_COUNTERS];
825 swp_entry_t entry = (swp_entry_t){0};
826
827again:
828 init_rss_vec(rss);
829
830 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
831 if (!dst_pte)
832 return -ENOMEM;
833 src_pte = pte_offset_map(src_pmd, addr);
834 src_ptl = pte_lockptr(src_mm, src_pmd);
835 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
836 orig_src_pte = src_pte;
837 orig_dst_pte = dst_pte;
838 arch_enter_lazy_mmu_mode();
839
840 do {
841
842
843
844
845 if (progress >= 32) {
846 progress = 0;
847 if (need_resched() ||
848 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
849 break;
850 }
851 if (pte_none(*src_pte)) {
852 progress++;
853 continue;
854 }
855 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
856 vma, addr, rss);
857 if (entry.val)
858 break;
859 progress += 8;
860 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
861
862 arch_leave_lazy_mmu_mode();
863 spin_unlock(src_ptl);
864 pte_unmap(orig_src_pte);
865 add_mm_rss_vec(dst_mm, rss);
866 pte_unmap_unlock(orig_dst_pte, dst_ptl);
867 cond_resched();
868
869 if (entry.val) {
870 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
871 return -ENOMEM;
872 progress = 0;
873 }
874 if (addr != end)
875 goto again;
876 return 0;
877}
878
879static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
880 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
881 unsigned long addr, unsigned long end)
882{
883 pmd_t *src_pmd, *dst_pmd;
884 unsigned long next;
885
886 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
887 if (!dst_pmd)
888 return -ENOMEM;
889 src_pmd = pmd_offset(src_pud, addr);
890 do {
891 next = pmd_addr_end(addr, end);
892 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
893 || pmd_devmap(*src_pmd)) {
894 int err;
895 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
896 err = copy_huge_pmd(dst_mm, src_mm,
897 dst_pmd, src_pmd, addr, vma);
898 if (err == -ENOMEM)
899 return -ENOMEM;
900 if (!err)
901 continue;
902
903 }
904 if (pmd_none_or_clear_bad(src_pmd))
905 continue;
906 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
907 vma, addr, next))
908 return -ENOMEM;
909 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
910 return 0;
911}
912
913static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
914 p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
915 unsigned long addr, unsigned long end)
916{
917 pud_t *src_pud, *dst_pud;
918 unsigned long next;
919
920 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
921 if (!dst_pud)
922 return -ENOMEM;
923 src_pud = pud_offset(src_p4d, addr);
924 do {
925 next = pud_addr_end(addr, end);
926 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
927 int err;
928
929 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
930 err = copy_huge_pud(dst_mm, src_mm,
931 dst_pud, src_pud, addr, vma);
932 if (err == -ENOMEM)
933 return -ENOMEM;
934 if (!err)
935 continue;
936
937 }
938 if (pud_none_or_clear_bad(src_pud))
939 continue;
940 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
941 vma, addr, next))
942 return -ENOMEM;
943 } while (dst_pud++, src_pud++, addr = next, addr != end);
944 return 0;
945}
946
947static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
948 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
949 unsigned long addr, unsigned long end)
950{
951 p4d_t *src_p4d, *dst_p4d;
952 unsigned long next;
953
954 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
955 if (!dst_p4d)
956 return -ENOMEM;
957 src_p4d = p4d_offset(src_pgd, addr);
958 do {
959 next = p4d_addr_end(addr, end);
960 if (p4d_none_or_clear_bad(src_p4d))
961 continue;
962 if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
963 vma, addr, next))
964 return -ENOMEM;
965 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
966 return 0;
967}
968
969int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
970 struct vm_area_struct *vma)
971{
972 pgd_t *src_pgd, *dst_pgd;
973 unsigned long next;
974 unsigned long addr = vma->vm_start;
975 unsigned long end = vma->vm_end;
976 struct mmu_notifier_range range;
977 bool is_cow;
978 int ret;
979
980
981
982
983
984
985
986 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
987 !vma->anon_vma)
988 return 0;
989
990 if (is_vm_hugetlb_page(vma))
991 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
992
993 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
994
995
996
997
998 ret = track_pfn_copy(vma);
999 if (ret)
1000 return ret;
1001 }
1002
1003
1004
1005
1006
1007
1008
1009 is_cow = is_cow_mapping(vma->vm_flags);
1010
1011 if (is_cow) {
1012 mmu_notifier_range_init(&range, src_mm, addr, end);
1013 mmu_notifier_invalidate_range_start(&range);
1014 }
1015
1016 ret = 0;
1017 dst_pgd = pgd_offset(dst_mm, addr);
1018 src_pgd = pgd_offset(src_mm, addr);
1019 do {
1020 next = pgd_addr_end(addr, end);
1021 if (pgd_none_or_clear_bad(src_pgd))
1022 continue;
1023 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
1024 vma, addr, next))) {
1025 ret = -ENOMEM;
1026 break;
1027 }
1028 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1029
1030 if (is_cow)
1031 mmu_notifier_invalidate_range_end(&range);
1032 return ret;
1033}
1034
1035static unsigned long zap_pte_range(struct mmu_gather *tlb,
1036 struct vm_area_struct *vma, pmd_t *pmd,
1037 unsigned long addr, unsigned long end,
1038 struct zap_details *details)
1039{
1040 struct mm_struct *mm = tlb->mm;
1041 int force_flush = 0;
1042 int rss[NR_MM_COUNTERS];
1043 spinlock_t *ptl;
1044 pte_t *start_pte;
1045 pte_t *pte;
1046 swp_entry_t entry;
1047
1048 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
1049again:
1050 init_rss_vec(rss);
1051 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1052 pte = start_pte;
1053 flush_tlb_batched_pending(mm);
1054 arch_enter_lazy_mmu_mode();
1055 do {
1056 pte_t ptent = *pte;
1057 if (pte_none(ptent))
1058 continue;
1059
1060 if (pte_present(ptent)) {
1061 struct page *page;
1062
1063 page = _vm_normal_page(vma, addr, ptent, true);
1064 if (unlikely(details) && page) {
1065
1066
1067
1068
1069
1070 if (details->check_mapping &&
1071 details->check_mapping != page_rmapping(page))
1072 continue;
1073 }
1074 ptent = ptep_get_and_clear_full(mm, addr, pte,
1075 tlb->fullmm);
1076 tlb_remove_tlb_entry(tlb, pte, addr);
1077 if (unlikely(!page))
1078 continue;
1079
1080 if (!PageAnon(page)) {
1081 if (pte_dirty(ptent)) {
1082 force_flush = 1;
1083 set_page_dirty(page);
1084 }
1085 if (pte_young(ptent) &&
1086 likely(!(vma->vm_flags & VM_SEQ_READ)))
1087 mark_page_accessed(page);
1088 }
1089 rss[mm_counter(page)]--;
1090 page_remove_rmap(page, false);
1091 if (unlikely(page_mapcount(page) < 0))
1092 print_bad_pte(vma, addr, ptent, page);
1093 if (unlikely(__tlb_remove_page(tlb, page))) {
1094 force_flush = 1;
1095 addr += PAGE_SIZE;
1096 break;
1097 }
1098 continue;
1099 }
1100
1101 entry = pte_to_swp_entry(ptent);
1102 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1103 struct page *page = device_private_entry_to_page(entry);
1104
1105 if (unlikely(details && details->check_mapping)) {
1106
1107
1108
1109
1110
1111 if (details->check_mapping !=
1112 page_rmapping(page))
1113 continue;
1114 }
1115
1116 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1117 rss[mm_counter(page)]--;
1118 page_remove_rmap(page, false);
1119 put_page(page);
1120 continue;
1121 }
1122
1123
1124 if (unlikely(details))
1125 continue;
1126
1127 entry = pte_to_swp_entry(ptent);
1128 if (!non_swap_entry(entry))
1129 rss[MM_SWAPENTS]--;
1130 else if (is_migration_entry(entry)) {
1131 struct page *page;
1132
1133 page = migration_entry_to_page(entry);
1134 rss[mm_counter(page)]--;
1135 }
1136 if (unlikely(!free_swap_and_cache(entry)))
1137 print_bad_pte(vma, addr, ptent, NULL);
1138 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1139 } while (pte++, addr += PAGE_SIZE, addr != end);
1140
1141 add_mm_rss_vec(mm, rss);
1142 arch_leave_lazy_mmu_mode();
1143
1144
1145 if (force_flush)
1146 tlb_flush_mmu_tlbonly(tlb);
1147 pte_unmap_unlock(start_pte, ptl);
1148
1149
1150
1151
1152
1153
1154
1155 if (force_flush) {
1156 force_flush = 0;
1157 tlb_flush_mmu_free(tlb);
1158 if (addr != end)
1159 goto again;
1160 }
1161
1162 return addr;
1163}
1164
1165static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1166 struct vm_area_struct *vma, pud_t *pud,
1167 unsigned long addr, unsigned long end,
1168 struct zap_details *details)
1169{
1170 pmd_t *pmd;
1171 unsigned long next;
1172
1173 pmd = pmd_offset(pud, addr);
1174 do {
1175 next = pmd_addr_end(addr, end);
1176 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1177 if (next - addr != HPAGE_PMD_SIZE)
1178 __split_huge_pmd(vma, pmd, addr, false, NULL);
1179 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1180 goto next;
1181
1182 }
1183
1184
1185
1186
1187
1188
1189
1190 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1191 goto next;
1192 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1193next:
1194 cond_resched();
1195 } while (pmd++, addr = next, addr != end);
1196
1197 return addr;
1198}
1199
1200static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1201 struct vm_area_struct *vma, p4d_t *p4d,
1202 unsigned long addr, unsigned long end,
1203 struct zap_details *details)
1204{
1205 pud_t *pud;
1206 unsigned long next;
1207
1208 pud = pud_offset(p4d, addr);
1209 do {
1210 next = pud_addr_end(addr, end);
1211 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1212 if (next - addr != HPAGE_PUD_SIZE) {
1213 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1214 split_huge_pud(vma, pud, addr);
1215 } else if (zap_huge_pud(tlb, vma, pud, addr))
1216 goto next;
1217
1218 }
1219 if (pud_none_or_clear_bad(pud))
1220 continue;
1221 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1222next:
1223 cond_resched();
1224 } while (pud++, addr = next, addr != end);
1225
1226 return addr;
1227}
1228
1229static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1230 struct vm_area_struct *vma, pgd_t *pgd,
1231 unsigned long addr, unsigned long end,
1232 struct zap_details *details)
1233{
1234 p4d_t *p4d;
1235 unsigned long next;
1236
1237 p4d = p4d_offset(pgd, addr);
1238 do {
1239 next = p4d_addr_end(addr, end);
1240 if (p4d_none_or_clear_bad(p4d))
1241 continue;
1242 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1243 } while (p4d++, addr = next, addr != end);
1244
1245 return addr;
1246}
1247
1248void unmap_page_range(struct mmu_gather *tlb,
1249 struct vm_area_struct *vma,
1250 unsigned long addr, unsigned long end,
1251 struct zap_details *details)
1252{
1253 pgd_t *pgd;
1254 unsigned long next;
1255
1256 BUG_ON(addr >= end);
1257 tlb_start_vma(tlb, vma);
1258 pgd = pgd_offset(vma->vm_mm, addr);
1259 do {
1260 next = pgd_addr_end(addr, end);
1261 if (pgd_none_or_clear_bad(pgd))
1262 continue;
1263 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1264 } while (pgd++, addr = next, addr != end);
1265 tlb_end_vma(tlb, vma);
1266}
1267
1268
1269static void unmap_single_vma(struct mmu_gather *tlb,
1270 struct vm_area_struct *vma, unsigned long start_addr,
1271 unsigned long end_addr,
1272 struct zap_details *details)
1273{
1274 unsigned long start = max(vma->vm_start, start_addr);
1275 unsigned long end;
1276
1277 if (start >= vma->vm_end)
1278 return;
1279 end = min(vma->vm_end, end_addr);
1280 if (end <= vma->vm_start)
1281 return;
1282
1283 if (vma->vm_file)
1284 uprobe_munmap(vma, start, end);
1285
1286 if (unlikely(vma->vm_flags & VM_PFNMAP))
1287 untrack_pfn(vma, 0, 0);
1288
1289 if (start != end) {
1290 if (unlikely(is_vm_hugetlb_page(vma))) {
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302 if (vma->vm_file) {
1303 i_mmap_lock_write(vma->vm_file->f_mapping);
1304 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1305 i_mmap_unlock_write(vma->vm_file->f_mapping);
1306 }
1307 } else
1308 unmap_page_range(tlb, vma, start, end, details);
1309 }
1310}
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330void unmap_vmas(struct mmu_gather *tlb,
1331 struct vm_area_struct *vma, unsigned long start_addr,
1332 unsigned long end_addr)
1333{
1334 struct mmu_notifier_range range;
1335
1336 mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
1337 mmu_notifier_invalidate_range_start(&range);
1338 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1339 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1340 mmu_notifier_invalidate_range_end(&range);
1341}
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1352 unsigned long size)
1353{
1354 struct mmu_notifier_range range;
1355 struct mmu_gather tlb;
1356
1357 lru_add_drain();
1358 mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
1359 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1360 update_hiwater_rss(vma->vm_mm);
1361 mmu_notifier_invalidate_range_start(&range);
1362 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1363 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1364 mmu_notifier_invalidate_range_end(&range);
1365 tlb_finish_mmu(&tlb, start, range.end);
1366}
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1378 unsigned long size, struct zap_details *details)
1379{
1380 struct mmu_notifier_range range;
1381 struct mmu_gather tlb;
1382
1383 lru_add_drain();
1384 mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
1385 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1386 update_hiwater_rss(vma->vm_mm);
1387 mmu_notifier_invalidate_range_start(&range);
1388 unmap_single_vma(&tlb, vma, address, range.end, details);
1389 mmu_notifier_invalidate_range_end(&range);
1390 tlb_finish_mmu(&tlb, address, range.end);
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1405 unsigned long size)
1406{
1407 if (address < vma->vm_start || address + size > vma->vm_end ||
1408 !(vma->vm_flags & VM_PFNMAP))
1409 return;
1410
1411 zap_page_range_single(vma, address, size, NULL);
1412}
1413EXPORT_SYMBOL_GPL(zap_vma_ptes);
1414
1415pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1416 spinlock_t **ptl)
1417{
1418 pgd_t *pgd;
1419 p4d_t *p4d;
1420 pud_t *pud;
1421 pmd_t *pmd;
1422
1423 pgd = pgd_offset(mm, addr);
1424 p4d = p4d_alloc(mm, pgd, addr);
1425 if (!p4d)
1426 return NULL;
1427 pud = pud_alloc(mm, p4d, addr);
1428 if (!pud)
1429 return NULL;
1430 pmd = pmd_alloc(mm, pud, addr);
1431 if (!pmd)
1432 return NULL;
1433
1434 VM_BUG_ON(pmd_trans_huge(*pmd));
1435 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1436}
1437
1438
1439
1440
1441
1442
1443
1444
1445static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1446 struct page *page, pgprot_t prot)
1447{
1448 struct mm_struct *mm = vma->vm_mm;
1449 int retval;
1450 pte_t *pte;
1451 spinlock_t *ptl;
1452
1453 retval = -EINVAL;
1454 if (PageAnon(page))
1455 goto out;
1456 retval = -ENOMEM;
1457 flush_dcache_page(page);
1458 pte = get_locked_pte(mm, addr, &ptl);
1459 if (!pte)
1460 goto out;
1461 retval = -EBUSY;
1462 if (!pte_none(*pte))
1463 goto out_unlock;
1464
1465
1466 get_page(page);
1467 inc_mm_counter_fast(mm, mm_counter_file(page));
1468 page_add_file_rmap(page, false);
1469 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1470
1471 retval = 0;
1472 pte_unmap_unlock(pte, ptl);
1473 return retval;
1474out_unlock:
1475 pte_unmap_unlock(pte, ptl);
1476out:
1477 return retval;
1478}
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1508 struct page *page)
1509{
1510 if (addr < vma->vm_start || addr >= vma->vm_end)
1511 return -EFAULT;
1512 if (!page_count(page))
1513 return -EINVAL;
1514 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1515 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1516 BUG_ON(vma->vm_flags & VM_PFNMAP);
1517 vma->vm_flags |= VM_MIXEDMAP;
1518 }
1519 return insert_page(vma, addr, page, vma->vm_page_prot);
1520}
1521EXPORT_SYMBOL(vm_insert_page);
1522
1523static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1524 pfn_t pfn, pgprot_t prot, bool mkwrite)
1525{
1526 struct mm_struct *mm = vma->vm_mm;
1527 pte_t *pte, entry;
1528 spinlock_t *ptl;
1529
1530 pte = get_locked_pte(mm, addr, &ptl);
1531 if (!pte)
1532 return VM_FAULT_OOM;
1533 if (!pte_none(*pte)) {
1534 if (mkwrite) {
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1546 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1547 goto out_unlock;
1548 }
1549 entry = *pte;
1550 goto out_mkwrite;
1551 } else
1552 goto out_unlock;
1553 }
1554
1555
1556 if (pfn_t_devmap(pfn))
1557 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1558 else
1559 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1560
1561out_mkwrite:
1562 if (mkwrite) {
1563 entry = pte_mkyoung(entry);
1564 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1565 }
1566
1567 set_pte_at(mm, addr, pte, entry);
1568 update_mmu_cache(vma, addr, pte);
1569
1570out_unlock:
1571 pte_unmap_unlock(pte, ptl);
1572 return VM_FAULT_NOPAGE;
1573}
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1594 unsigned long pfn, pgprot_t pgprot)
1595{
1596
1597
1598
1599
1600
1601
1602 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1603 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1604 (VM_PFNMAP|VM_MIXEDMAP));
1605 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1606 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1607
1608 if (addr < vma->vm_start || addr >= vma->vm_end)
1609 return VM_FAULT_SIGBUS;
1610
1611 if (!pfn_modify_allowed(pfn, pgprot))
1612 return VM_FAULT_SIGBUS;
1613
1614 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1615
1616 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1617 false);
1618}
1619EXPORT_SYMBOL(vmf_insert_pfn_prot);
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1642 unsigned long pfn)
1643{
1644 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1645}
1646EXPORT_SYMBOL(vmf_insert_pfn);
1647
1648static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1649{
1650
1651 if (vma->vm_flags & VM_MIXEDMAP)
1652 return true;
1653 if (pfn_t_devmap(pfn))
1654 return true;
1655 if (pfn_t_special(pfn))
1656 return true;
1657 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
1658 return true;
1659 return false;
1660}
1661
1662static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
1663 unsigned long addr, pfn_t pfn, bool mkwrite)
1664{
1665 pgprot_t pgprot = vma->vm_page_prot;
1666 int err;
1667
1668 BUG_ON(!vm_mixed_ok(vma, pfn));
1669
1670 if (addr < vma->vm_start || addr >= vma->vm_end)
1671 return VM_FAULT_SIGBUS;
1672
1673 track_pfn_insert(vma, &pgprot, pfn);
1674
1675 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1676 return VM_FAULT_SIGBUS;
1677
1678
1679
1680
1681
1682
1683
1684
1685 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
1686 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1687 struct page *page;
1688
1689
1690
1691
1692
1693
1694 page = pfn_to_page(pfn_t_to_pfn(pfn));
1695 err = insert_page(vma, addr, page, pgprot);
1696 } else {
1697 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1698 }
1699
1700 if (err == -ENOMEM)
1701 return VM_FAULT_OOM;
1702 if (err < 0 && err != -EBUSY)
1703 return VM_FAULT_SIGBUS;
1704
1705 return VM_FAULT_NOPAGE;
1706}
1707
1708vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1709 pfn_t pfn)
1710{
1711 return __vm_insert_mixed(vma, addr, pfn, false);
1712}
1713EXPORT_SYMBOL(vmf_insert_mixed);
1714
1715
1716
1717
1718
1719
1720vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
1721 unsigned long addr, pfn_t pfn)
1722{
1723 return __vm_insert_mixed(vma, addr, pfn, true);
1724}
1725EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
1726
1727
1728
1729
1730
1731
1732static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1733 unsigned long addr, unsigned long end,
1734 unsigned long pfn, pgprot_t prot)
1735{
1736 pte_t *pte;
1737 spinlock_t *ptl;
1738 int err = 0;
1739
1740 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1741 if (!pte)
1742 return -ENOMEM;
1743 arch_enter_lazy_mmu_mode();
1744 do {
1745 BUG_ON(!pte_none(*pte));
1746 if (!pfn_modify_allowed(pfn, prot)) {
1747 err = -EACCES;
1748 break;
1749 }
1750 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1751 pfn++;
1752 } while (pte++, addr += PAGE_SIZE, addr != end);
1753 arch_leave_lazy_mmu_mode();
1754 pte_unmap_unlock(pte - 1, ptl);
1755 return err;
1756}
1757
1758static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1759 unsigned long addr, unsigned long end,
1760 unsigned long pfn, pgprot_t prot)
1761{
1762 pmd_t *pmd;
1763 unsigned long next;
1764 int err;
1765
1766 pfn -= addr >> PAGE_SHIFT;
1767 pmd = pmd_alloc(mm, pud, addr);
1768 if (!pmd)
1769 return -ENOMEM;
1770 VM_BUG_ON(pmd_trans_huge(*pmd));
1771 do {
1772 next = pmd_addr_end(addr, end);
1773 err = remap_pte_range(mm, pmd, addr, next,
1774 pfn + (addr >> PAGE_SHIFT), prot);
1775 if (err)
1776 return err;
1777 } while (pmd++, addr = next, addr != end);
1778 return 0;
1779}
1780
1781static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
1782 unsigned long addr, unsigned long end,
1783 unsigned long pfn, pgprot_t prot)
1784{
1785 pud_t *pud;
1786 unsigned long next;
1787 int err;
1788
1789 pfn -= addr >> PAGE_SHIFT;
1790 pud = pud_alloc(mm, p4d, addr);
1791 if (!pud)
1792 return -ENOMEM;
1793 do {
1794 next = pud_addr_end(addr, end);
1795 err = remap_pmd_range(mm, pud, addr, next,
1796 pfn + (addr >> PAGE_SHIFT), prot);
1797 if (err)
1798 return err;
1799 } while (pud++, addr = next, addr != end);
1800 return 0;
1801}
1802
1803static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
1804 unsigned long addr, unsigned long end,
1805 unsigned long pfn, pgprot_t prot)
1806{
1807 p4d_t *p4d;
1808 unsigned long next;
1809 int err;
1810
1811 pfn -= addr >> PAGE_SHIFT;
1812 p4d = p4d_alloc(mm, pgd, addr);
1813 if (!p4d)
1814 return -ENOMEM;
1815 do {
1816 next = p4d_addr_end(addr, end);
1817 err = remap_pud_range(mm, p4d, addr, next,
1818 pfn + (addr >> PAGE_SHIFT), prot);
1819 if (err)
1820 return err;
1821 } while (p4d++, addr = next, addr != end);
1822 return 0;
1823}
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1836 unsigned long pfn, unsigned long size, pgprot_t prot)
1837{
1838 pgd_t *pgd;
1839 unsigned long next;
1840 unsigned long end = addr + PAGE_ALIGN(size);
1841 struct mm_struct *mm = vma->vm_mm;
1842 unsigned long remap_pfn = pfn;
1843 int err;
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863 if (is_cow_mapping(vma->vm_flags)) {
1864 if (addr != vma->vm_start || end != vma->vm_end)
1865 return -EINVAL;
1866 vma->vm_pgoff = pfn;
1867 }
1868
1869 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
1870 if (err)
1871 return -EINVAL;
1872
1873 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1874
1875 BUG_ON(addr >= end);
1876 pfn -= addr >> PAGE_SHIFT;
1877 pgd = pgd_offset(mm, addr);
1878 flush_cache_range(vma, addr, end);
1879 do {
1880 next = pgd_addr_end(addr, end);
1881 err = remap_p4d_range(mm, pgd, addr, next,
1882 pfn + (addr >> PAGE_SHIFT), prot);
1883 if (err)
1884 break;
1885 } while (pgd++, addr = next, addr != end);
1886
1887 if (err)
1888 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
1889
1890 return err;
1891}
1892EXPORT_SYMBOL(remap_pfn_range);
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1908{
1909 unsigned long vm_len, pfn, pages;
1910
1911
1912 if (start + len < start)
1913 return -EINVAL;
1914
1915
1916
1917
1918
1919 len += start & ~PAGE_MASK;
1920 pfn = start >> PAGE_SHIFT;
1921 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1922 if (pfn + pages < pfn)
1923 return -EINVAL;
1924
1925
1926 if (vma->vm_pgoff > pages)
1927 return -EINVAL;
1928 pfn += vma->vm_pgoff;
1929 pages -= vma->vm_pgoff;
1930
1931
1932 vm_len = vma->vm_end - vma->vm_start;
1933 if (vm_len >> PAGE_SHIFT > pages)
1934 return -EINVAL;
1935
1936
1937 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1938}
1939EXPORT_SYMBOL(vm_iomap_memory);
1940
1941static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1942 unsigned long addr, unsigned long end,
1943 pte_fn_t fn, void *data)
1944{
1945 pte_t *pte;
1946 int err;
1947 pgtable_t token;
1948 spinlock_t *uninitialized_var(ptl);
1949
1950 pte = (mm == &init_mm) ?
1951 pte_alloc_kernel(pmd, addr) :
1952 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1953 if (!pte)
1954 return -ENOMEM;
1955
1956 BUG_ON(pmd_huge(*pmd));
1957
1958 arch_enter_lazy_mmu_mode();
1959
1960 token = pmd_pgtable(*pmd);
1961
1962 do {
1963 err = fn(pte++, token, addr, data);
1964 if (err)
1965 break;
1966 } while (addr += PAGE_SIZE, addr != end);
1967
1968 arch_leave_lazy_mmu_mode();
1969
1970 if (mm != &init_mm)
1971 pte_unmap_unlock(pte-1, ptl);
1972 return err;
1973}
1974
1975static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1976 unsigned long addr, unsigned long end,
1977 pte_fn_t fn, void *data)
1978{
1979 pmd_t *pmd;
1980 unsigned long next;
1981 int err;
1982
1983 BUG_ON(pud_huge(*pud));
1984
1985 pmd = pmd_alloc(mm, pud, addr);
1986 if (!pmd)
1987 return -ENOMEM;
1988 do {
1989 next = pmd_addr_end(addr, end);
1990 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1991 if (err)
1992 break;
1993 } while (pmd++, addr = next, addr != end);
1994 return err;
1995}
1996
1997static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
1998 unsigned long addr, unsigned long end,
1999 pte_fn_t fn, void *data)
2000{
2001 pud_t *pud;
2002 unsigned long next;
2003 int err;
2004
2005 pud = pud_alloc(mm, p4d, addr);
2006 if (!pud)
2007 return -ENOMEM;
2008 do {
2009 next = pud_addr_end(addr, end);
2010 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2011 if (err)
2012 break;
2013 } while (pud++, addr = next, addr != end);
2014 return err;
2015}
2016
2017static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2018 unsigned long addr, unsigned long end,
2019 pte_fn_t fn, void *data)
2020{
2021 p4d_t *p4d;
2022 unsigned long next;
2023 int err;
2024
2025 p4d = p4d_alloc(mm, pgd, addr);
2026 if (!p4d)
2027 return -ENOMEM;
2028 do {
2029 next = p4d_addr_end(addr, end);
2030 err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2031 if (err)
2032 break;
2033 } while (p4d++, addr = next, addr != end);
2034 return err;
2035}
2036
2037
2038
2039
2040
2041int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2042 unsigned long size, pte_fn_t fn, void *data)
2043{
2044 pgd_t *pgd;
2045 unsigned long next;
2046 unsigned long end = addr + size;
2047 int err;
2048
2049 if (WARN_ON(addr >= end))
2050 return -EINVAL;
2051
2052 pgd = pgd_offset(mm, addr);
2053 do {
2054 next = pgd_addr_end(addr, end);
2055 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2056 if (err)
2057 break;
2058 } while (pgd++, addr = next, addr != end);
2059
2060 return err;
2061}
2062EXPORT_SYMBOL_GPL(apply_to_page_range);
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2073 pte_t *page_table, pte_t orig_pte)
2074{
2075 int same = 1;
2076#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2077 if (sizeof(pte_t) > sizeof(unsigned long)) {
2078 spinlock_t *ptl = pte_lockptr(mm, pmd);
2079 spin_lock(ptl);
2080 same = pte_same(*page_table, orig_pte);
2081 spin_unlock(ptl);
2082 }
2083#endif
2084 pte_unmap(page_table);
2085 return same;
2086}
2087
2088static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2089{
2090 debug_dma_assert_idle(src);
2091
2092
2093
2094
2095
2096
2097
2098 if (unlikely(!src)) {
2099 void *kaddr = kmap_atomic(dst);
2100 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2101
2102
2103
2104
2105
2106
2107
2108 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2109 clear_page(kaddr);
2110 kunmap_atomic(kaddr);
2111 flush_dcache_page(dst);
2112 } else
2113 copy_user_highpage(dst, src, va, vma);
2114}
2115
2116static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2117{
2118 struct file *vm_file = vma->vm_file;
2119
2120 if (vm_file)
2121 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2122
2123
2124
2125
2126
2127 return GFP_KERNEL;
2128}
2129
2130
2131
2132
2133
2134
2135
2136static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2137{
2138 vm_fault_t ret;
2139 struct page *page = vmf->page;
2140 unsigned int old_flags = vmf->flags;
2141
2142 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2143
2144 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2145
2146 vmf->flags = old_flags;
2147 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2148 return ret;
2149 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2150 lock_page(page);
2151 if (!page->mapping) {
2152 unlock_page(page);
2153 return 0;
2154 }
2155 ret |= VM_FAULT_LOCKED;
2156 } else
2157 VM_BUG_ON_PAGE(!PageLocked(page), page);
2158 return ret;
2159}
2160
2161
2162
2163
2164
2165
2166static void fault_dirty_shared_page(struct vm_area_struct *vma,
2167 struct page *page)
2168{
2169 struct address_space *mapping;
2170 bool dirtied;
2171 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2172
2173 dirtied = set_page_dirty(page);
2174 VM_BUG_ON_PAGE(PageAnon(page), page);
2175
2176
2177
2178
2179
2180
2181 mapping = page_rmapping(page);
2182 unlock_page(page);
2183
2184 if ((dirtied || page_mkwrite) && mapping) {
2185
2186
2187
2188
2189 balance_dirty_pages_ratelimited(mapping);
2190 }
2191
2192 if (!page_mkwrite)
2193 file_update_time(vma->vm_file);
2194}
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204static inline void wp_page_reuse(struct vm_fault *vmf)
2205 __releases(vmf->ptl)
2206{
2207 struct vm_area_struct *vma = vmf->vma;
2208 struct page *page = vmf->page;
2209 pte_t entry;
2210
2211
2212
2213
2214
2215 if (page)
2216 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2217
2218 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2219 entry = pte_mkyoung(vmf->orig_pte);
2220 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2221 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2222 update_mmu_cache(vma, vmf->address, vmf->pte);
2223 pte_unmap_unlock(vmf->pte, vmf->ptl);
2224}
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2243{
2244 struct vm_area_struct *vma = vmf->vma;
2245 struct mm_struct *mm = vma->vm_mm;
2246 struct page *old_page = vmf->page;
2247 struct page *new_page = NULL;
2248 pte_t entry;
2249 int page_copied = 0;
2250 struct mem_cgroup *memcg;
2251 struct mmu_notifier_range range;
2252
2253 if (unlikely(anon_vma_prepare(vma)))
2254 goto oom;
2255
2256 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2257 new_page = alloc_zeroed_user_highpage_movable(vma,
2258 vmf->address);
2259 if (!new_page)
2260 goto oom;
2261 } else {
2262 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2263 vmf->address);
2264 if (!new_page)
2265 goto oom;
2266 cow_user_page(new_page, old_page, vmf->address, vma);
2267 }
2268
2269 if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
2270 goto oom_free_new;
2271
2272 __SetPageUptodate(new_page);
2273
2274 mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
2275 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2276 mmu_notifier_invalidate_range_start(&range);
2277
2278
2279
2280
2281 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2282 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2283 if (old_page) {
2284 if (!PageAnon(old_page)) {
2285 dec_mm_counter_fast(mm,
2286 mm_counter_file(old_page));
2287 inc_mm_counter_fast(mm, MM_ANONPAGES);
2288 }
2289 } else {
2290 inc_mm_counter_fast(mm, MM_ANONPAGES);
2291 }
2292 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2293 entry = mk_pte(new_page, vma->vm_page_prot);
2294 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2295
2296
2297
2298
2299
2300
2301 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2302 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2303 mem_cgroup_commit_charge(new_page, memcg, false, false);
2304 lru_cache_add_active_or_unevictable(new_page, vma);
2305
2306
2307
2308
2309
2310 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2311 update_mmu_cache(vma, vmf->address, vmf->pte);
2312 if (old_page) {
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335 page_remove_rmap(old_page, false);
2336 }
2337
2338
2339 new_page = old_page;
2340 page_copied = 1;
2341 } else {
2342 mem_cgroup_cancel_charge(new_page, memcg, false);
2343 }
2344
2345 if (new_page)
2346 put_page(new_page);
2347
2348 pte_unmap_unlock(vmf->pte, vmf->ptl);
2349
2350
2351
2352
2353 mmu_notifier_invalidate_range_only_end(&range);
2354 if (old_page) {
2355
2356
2357
2358
2359 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2360 lock_page(old_page);
2361 if (PageMlocked(old_page))
2362 munlock_vma_page(old_page);
2363 unlock_page(old_page);
2364 }
2365 put_page(old_page);
2366 }
2367 return page_copied ? VM_FAULT_WRITE : 0;
2368oom_free_new:
2369 put_page(new_page);
2370oom:
2371 if (old_page)
2372 put_page(old_page);
2373 return VM_FAULT_OOM;
2374}
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2392{
2393 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2394 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2395 &vmf->ptl);
2396
2397
2398
2399
2400 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2401 pte_unmap_unlock(vmf->pte, vmf->ptl);
2402 return VM_FAULT_NOPAGE;
2403 }
2404 wp_page_reuse(vmf);
2405 return 0;
2406}
2407
2408
2409
2410
2411
2412static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
2413{
2414 struct vm_area_struct *vma = vmf->vma;
2415
2416 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2417 vm_fault_t ret;
2418
2419 pte_unmap_unlock(vmf->pte, vmf->ptl);
2420 vmf->flags |= FAULT_FLAG_MKWRITE;
2421 ret = vma->vm_ops->pfn_mkwrite(vmf);
2422 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2423 return ret;
2424 return finish_mkwrite_fault(vmf);
2425 }
2426 wp_page_reuse(vmf);
2427 return VM_FAULT_WRITE;
2428}
2429
2430static vm_fault_t wp_page_shared(struct vm_fault *vmf)
2431 __releases(vmf->ptl)
2432{
2433 struct vm_area_struct *vma = vmf->vma;
2434
2435 get_page(vmf->page);
2436
2437 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2438 vm_fault_t tmp;
2439
2440 pte_unmap_unlock(vmf->pte, vmf->ptl);
2441 tmp = do_page_mkwrite(vmf);
2442 if (unlikely(!tmp || (tmp &
2443 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2444 put_page(vmf->page);
2445 return tmp;
2446 }
2447 tmp = finish_mkwrite_fault(vmf);
2448 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2449 unlock_page(vmf->page);
2450 put_page(vmf->page);
2451 return tmp;
2452 }
2453 } else {
2454 wp_page_reuse(vmf);
2455 lock_page(vmf->page);
2456 }
2457 fault_dirty_shared_page(vma, vmf->page);
2458 put_page(vmf->page);
2459
2460 return VM_FAULT_WRITE;
2461}
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481static vm_fault_t do_wp_page(struct vm_fault *vmf)
2482 __releases(vmf->ptl)
2483{
2484 struct vm_area_struct *vma = vmf->vma;
2485
2486 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2487 if (!vmf->page) {
2488
2489
2490
2491
2492
2493
2494
2495 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2496 (VM_WRITE|VM_SHARED))
2497 return wp_pfn_shared(vmf);
2498
2499 pte_unmap_unlock(vmf->pte, vmf->ptl);
2500 return wp_page_copy(vmf);
2501 }
2502
2503
2504
2505
2506
2507 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2508 int total_map_swapcount;
2509 if (!trylock_page(vmf->page)) {
2510 get_page(vmf->page);
2511 pte_unmap_unlock(vmf->pte, vmf->ptl);
2512 lock_page(vmf->page);
2513 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2514 vmf->address, &vmf->ptl);
2515 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2516 unlock_page(vmf->page);
2517 pte_unmap_unlock(vmf->pte, vmf->ptl);
2518 put_page(vmf->page);
2519 return 0;
2520 }
2521 put_page(vmf->page);
2522 }
2523 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2524 if (total_map_swapcount == 1) {
2525
2526
2527
2528
2529
2530
2531
2532 page_move_anon_rmap(vmf->page, vma);
2533 }
2534 unlock_page(vmf->page);
2535 wp_page_reuse(vmf);
2536 return VM_FAULT_WRITE;
2537 }
2538 unlock_page(vmf->page);
2539 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2540 (VM_WRITE|VM_SHARED))) {
2541 return wp_page_shared(vmf);
2542 }
2543
2544
2545
2546
2547 get_page(vmf->page);
2548
2549 pte_unmap_unlock(vmf->pte, vmf->ptl);
2550 return wp_page_copy(vmf);
2551}
2552
2553static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2554 unsigned long start_addr, unsigned long end_addr,
2555 struct zap_details *details)
2556{
2557 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2558}
2559
2560static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2561 struct zap_details *details)
2562{
2563 struct vm_area_struct *vma;
2564 pgoff_t vba, vea, zba, zea;
2565
2566 vma_interval_tree_foreach(vma, root,
2567 details->first_index, details->last_index) {
2568
2569 vba = vma->vm_pgoff;
2570 vea = vba + vma_pages(vma) - 1;
2571 zba = details->first_index;
2572 if (zba < vba)
2573 zba = vba;
2574 zea = details->last_index;
2575 if (zea > vea)
2576 zea = vea;
2577
2578 unmap_mapping_range_vma(vma,
2579 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2580 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2581 details);
2582 }
2583}
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
2598 pgoff_t nr, bool even_cows)
2599{
2600 struct zap_details details = { };
2601
2602 details.check_mapping = even_cows ? NULL : mapping;
2603 details.first_index = start;
2604 details.last_index = start + nr - 1;
2605 if (details.last_index < details.first_index)
2606 details.last_index = ULONG_MAX;
2607
2608 i_mmap_lock_write(mapping);
2609 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2610 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2611 i_mmap_unlock_write(mapping);
2612}
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631void unmap_mapping_range(struct address_space *mapping,
2632 loff_t const holebegin, loff_t const holelen, int even_cows)
2633{
2634 pgoff_t hba = holebegin >> PAGE_SHIFT;
2635 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2636
2637
2638 if (sizeof(holelen) > sizeof(hlen)) {
2639 long long holeend =
2640 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2641 if (holeend & ~(long long)ULONG_MAX)
2642 hlen = ULONG_MAX - hba + 1;
2643 }
2644
2645 unmap_mapping_pages(mapping, hba, hlen, even_cows);
2646}
2647EXPORT_SYMBOL(unmap_mapping_range);
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657vm_fault_t do_swap_page(struct vm_fault *vmf)
2658{
2659 struct vm_area_struct *vma = vmf->vma;
2660 struct page *page = NULL, *swapcache;
2661 struct mem_cgroup *memcg;
2662 swp_entry_t entry;
2663 pte_t pte;
2664 int locked;
2665 int exclusive = 0;
2666 vm_fault_t ret = 0;
2667
2668 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2669 goto out;
2670
2671 entry = pte_to_swp_entry(vmf->orig_pte);
2672 if (unlikely(non_swap_entry(entry))) {
2673 if (is_migration_entry(entry)) {
2674 migration_entry_wait(vma->vm_mm, vmf->pmd,
2675 vmf->address);
2676 } else if (is_device_private_entry(entry)) {
2677
2678
2679
2680
2681
2682 ret = device_private_entry_fault(vma, vmf->address, entry,
2683 vmf->flags, vmf->pmd);
2684 } else if (is_hwpoison_entry(entry)) {
2685 ret = VM_FAULT_HWPOISON;
2686 } else {
2687 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2688 ret = VM_FAULT_SIGBUS;
2689 }
2690 goto out;
2691 }
2692
2693
2694 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2695 page = lookup_swap_cache(entry, vma, vmf->address);
2696 swapcache = page;
2697
2698 if (!page) {
2699 struct swap_info_struct *si = swp_swap_info(entry);
2700
2701 if (si->flags & SWP_SYNCHRONOUS_IO &&
2702 __swap_count(si, entry) == 1) {
2703
2704 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2705 vmf->address);
2706 if (page) {
2707 __SetPageLocked(page);
2708 __SetPageSwapBacked(page);
2709 set_page_private(page, entry.val);
2710 lru_cache_add_anon(page);
2711 swap_readpage(page, true);
2712 }
2713 } else {
2714 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2715 vmf);
2716 swapcache = page;
2717 }
2718
2719 if (!page) {
2720
2721
2722
2723
2724 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2725 vmf->address, &vmf->ptl);
2726 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2727 ret = VM_FAULT_OOM;
2728 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2729 goto unlock;
2730 }
2731
2732
2733 ret = VM_FAULT_MAJOR;
2734 count_vm_event(PGMAJFAULT);
2735 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2736 } else if (PageHWPoison(page)) {
2737
2738
2739
2740
2741 ret = VM_FAULT_HWPOISON;
2742 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2743 goto out_release;
2744 }
2745
2746 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2747
2748 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2749 if (!locked) {
2750 ret |= VM_FAULT_RETRY;
2751 goto out_release;
2752 }
2753
2754
2755
2756
2757
2758
2759
2760 if (unlikely((!PageSwapCache(page) ||
2761 page_private(page) != entry.val)) && swapcache)
2762 goto out_page;
2763
2764 page = ksm_might_need_to_copy(page, vma, vmf->address);
2765 if (unlikely(!page)) {
2766 ret = VM_FAULT_OOM;
2767 page = swapcache;
2768 goto out_page;
2769 }
2770
2771 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
2772 &memcg, false)) {
2773 ret = VM_FAULT_OOM;
2774 goto out_page;
2775 }
2776
2777
2778
2779
2780 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2781 &vmf->ptl);
2782 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2783 goto out_nomap;
2784
2785 if (unlikely(!PageUptodate(page))) {
2786 ret = VM_FAULT_SIGBUS;
2787 goto out_nomap;
2788 }
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2801 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2802 pte = mk_pte(page, vma->vm_page_prot);
2803 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2804 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2805 vmf->flags &= ~FAULT_FLAG_WRITE;
2806 ret |= VM_FAULT_WRITE;
2807 exclusive = RMAP_EXCLUSIVE;
2808 }
2809 flush_icache_page(vma, page);
2810 if (pte_swp_soft_dirty(vmf->orig_pte))
2811 pte = pte_mksoft_dirty(pte);
2812 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2813 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
2814 vmf->orig_pte = pte;
2815
2816
2817 if (unlikely(page != swapcache && swapcache)) {
2818 page_add_new_anon_rmap(page, vma, vmf->address, false);
2819 mem_cgroup_commit_charge(page, memcg, false, false);
2820 lru_cache_add_active_or_unevictable(page, vma);
2821 } else {
2822 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2823 mem_cgroup_commit_charge(page, memcg, true, false);
2824 activate_page(page);
2825 }
2826
2827 swap_free(entry);
2828 if (mem_cgroup_swap_full(page) ||
2829 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2830 try_to_free_swap(page);
2831 unlock_page(page);
2832 if (page != swapcache && swapcache) {
2833
2834
2835
2836
2837
2838
2839
2840
2841 unlock_page(swapcache);
2842 put_page(swapcache);
2843 }
2844
2845 if (vmf->flags & FAULT_FLAG_WRITE) {
2846 ret |= do_wp_page(vmf);
2847 if (ret & VM_FAULT_ERROR)
2848 ret &= VM_FAULT_ERROR;
2849 goto out;
2850 }
2851
2852
2853 update_mmu_cache(vma, vmf->address, vmf->pte);
2854unlock:
2855 pte_unmap_unlock(vmf->pte, vmf->ptl);
2856out:
2857 return ret;
2858out_nomap:
2859 mem_cgroup_cancel_charge(page, memcg, false);
2860 pte_unmap_unlock(vmf->pte, vmf->ptl);
2861out_page:
2862 unlock_page(page);
2863out_release:
2864 put_page(page);
2865 if (page != swapcache && swapcache) {
2866 unlock_page(swapcache);
2867 put_page(swapcache);
2868 }
2869 return ret;
2870}
2871
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2877static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
2878{
2879 struct vm_area_struct *vma = vmf->vma;
2880 struct mem_cgroup *memcg;
2881 struct page *page;
2882 vm_fault_t ret = 0;
2883 pte_t entry;
2884
2885
2886 if (vma->vm_flags & VM_SHARED)
2887 return VM_FAULT_SIGBUS;
2888
/*
 * Use pte_alloc() instead of pte_alloc_map().  We can't run
 * pte_offset_map() on pmds where a huge pmd might be created
 * from a different thread.
 *
 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
 * parallel threads are excluded by other means.
 *
 * Here we only have down_read(mmap_sem).
 */
2899 if (pte_alloc(vma->vm_mm, vmf->pmd))
2900 return VM_FAULT_OOM;
2901
2902
2903 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2904 return 0;
2905
2906
2907 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2908 !mm_forbids_zeropage(vma->vm_mm)) {
2909 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2910 vma->vm_page_prot));
2911 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2912 vmf->address, &vmf->ptl);
2913 if (!pte_none(*vmf->pte))
2914 goto unlock;
2915 ret = check_stable_address_space(vma->vm_mm);
2916 if (ret)
2917 goto unlock;
2918
2919 if (userfaultfd_missing(vma)) {
2920 pte_unmap_unlock(vmf->pte, vmf->ptl);
2921 return handle_userfault(vmf, VM_UFFD_MISSING);
2922 }
2923 goto setpte;
2924 }
2925
2926
2927 if (unlikely(anon_vma_prepare(vma)))
2928 goto oom;
2929 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
2930 if (!page)
2931 goto oom;
2932
2933 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
2934 false))
2935 goto oom_free_page;
2936
2937
2938
2939
2940
2941
2942 __SetPageUptodate(page);
2943
2944 entry = mk_pte(page, vma->vm_page_prot);
2945 if (vma->vm_flags & VM_WRITE)
2946 entry = pte_mkwrite(pte_mkdirty(entry));
2947
2948 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2949 &vmf->ptl);
2950 if (!pte_none(*vmf->pte))
2951 goto release;
2952
2953 ret = check_stable_address_space(vma->vm_mm);
2954 if (ret)
2955 goto release;
2956
2957
2958 if (userfaultfd_missing(vma)) {
2959 pte_unmap_unlock(vmf->pte, vmf->ptl);
2960 mem_cgroup_cancel_charge(page, memcg, false);
2961 put_page(page);
2962 return handle_userfault(vmf, VM_UFFD_MISSING);
2963 }
2964
2965 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2966 page_add_new_anon_rmap(page, vma, vmf->address, false);
2967 mem_cgroup_commit_charge(page, memcg, false, false);
2968 lru_cache_add_active_or_unevictable(page, vma);
2969setpte:
2970 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
2971
2972
2973 update_mmu_cache(vma, vmf->address, vmf->pte);
2974unlock:
2975 pte_unmap_unlock(vmf->pte, vmf->ptl);
2976 return ret;
2977release:
2978 mem_cgroup_cancel_charge(page, memcg, false);
2979 put_page(page);
2980 goto unlock;
2981oom_free_page:
2982 put_page(page);
2983oom:
2984 return VM_FAULT_OOM;
2985}
2986
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
2992static vm_fault_t __do_fault(struct vm_fault *vmf)
2993{
2994 struct vm_area_struct *vma = vmf->vma;
2995 vm_fault_t ret;
2996
/*
 * Preallocate the pte before taking the page lock in ->fault():
 * allocating it afterwards, with the fault page already locked, can
 * deadlock against memcg reclaim, which may wait for writeback on
 * pages that cannot make progress while the page lock is held.
 */
3012 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3013 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3014 if (!vmf->prealloc_pte)
3015 return VM_FAULT_OOM;
3016 smp_wmb();
3017 }
3018
3019 ret = vma->vm_ops->fault(vmf);
3020 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3021 VM_FAULT_DONE_COW)))
3022 return ret;
3023
3024 if (unlikely(PageHWPoison(vmf->page))) {
3025 if (ret & VM_FAULT_LOCKED)
3026 unlock_page(vmf->page);
3027 put_page(vmf->page);
3028 vmf->page = NULL;
3029 return VM_FAULT_HWPOISON;
3030 }
3031
3032 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3033 lock_page(vmf->page);
3034 else
3035 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3036
3037 return ret;
3038}
3039
/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad().  This will end up
 * correctly returning 1, but not before it spams dmesg with the
 * pmd_clear_bad() output.
 */
3046static int pmd_devmap_trans_unstable(pmd_t *pmd)
3047{
3048 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3049}
3050
3051static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3052{
3053 struct vm_area_struct *vma = vmf->vma;
3054
3055 if (!pmd_none(*vmf->pmd))
3056 goto map_pte;
3057 if (vmf->prealloc_pte) {
3058 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3059 if (unlikely(!pmd_none(*vmf->pmd))) {
3060 spin_unlock(vmf->ptl);
3061 goto map_pte;
3062 }
3063
3064 mm_inc_nr_ptes(vma->vm_mm);
3065 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3066 spin_unlock(vmf->ptl);
3067 vmf->prealloc_pte = NULL;
3068 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3069 return VM_FAULT_OOM;
3070 }
3071map_pte:
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083 if (pmd_devmap_trans_unstable(vmf->pmd))
3084 return VM_FAULT_NOPAGE;
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3096 &vmf->ptl);
3097 return 0;
3098}
3099
3100#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3101
3102#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3103static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3104 unsigned long haddr)
3105{
3106 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3107 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3108 return false;
3109 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3110 return false;
3111 return true;
3112}
3113
3114static void deposit_prealloc_pte(struct vm_fault *vmf)
3115{
3116 struct vm_area_struct *vma = vmf->vma;
3117
3118 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3119
3120
3121
3122
3123 mm_inc_nr_ptes(vma->vm_mm);
3124 vmf->prealloc_pte = NULL;
3125}
3126
3127static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3128{
3129 struct vm_area_struct *vma = vmf->vma;
3130 bool write = vmf->flags & FAULT_FLAG_WRITE;
3131 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3132 pmd_t entry;
3133 int i;
3134 vm_fault_t ret;
3135
3136 if (!transhuge_vma_suitable(vma, haddr))
3137 return VM_FAULT_FALLBACK;
3138
3139 ret = VM_FAULT_FALLBACK;
3140 page = compound_head(page);
3141
3142
3143
3144
3145
3146 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3147 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3148 if (!vmf->prealloc_pte)
3149 return VM_FAULT_OOM;
3150 smp_wmb();
3151 }
3152
3153 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3154 if (unlikely(!pmd_none(*vmf->pmd)))
3155 goto out;
3156
3157 for (i = 0; i < HPAGE_PMD_NR; i++)
3158 flush_icache_page(vma, page + i);
3159
3160 entry = mk_huge_pmd(page, vma->vm_page_prot);
3161 if (write)
3162 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3163
3164 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3165 page_add_file_rmap(page, true);
3166
3167
3168
3169 if (arch_needs_pgtable_deposit())
3170 deposit_prealloc_pte(vmf);
3171
3172 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3173
3174 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3175
3176
3177 ret = 0;
3178 count_vm_event(THP_FILE_MAPPED);
3179out:
3180 spin_unlock(vmf->ptl);
3181 return ret;
3182}
3183#else
3184static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3185{
3186 BUILD_BUG();
3187 return 0;
3188}
3189#endif
3190
/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping.  If needed, the function allocates a page table or uses the
 * pre-allocated one.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are the page fault handler itself and implementations of
 * vm_ops->map_pages.
 */
3205vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3206 struct page *page)
3207{
3208 struct vm_area_struct *vma = vmf->vma;
3209 bool write = vmf->flags & FAULT_FLAG_WRITE;
3210 pte_t entry;
3211 vm_fault_t ret;
3212
3213 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3214 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3215
3216 VM_BUG_ON_PAGE(memcg, page);
3217
3218 ret = do_set_pmd(vmf, page);
3219 if (ret != VM_FAULT_FALLBACK)
3220 return ret;
3221 }
3222
3223 if (!vmf->pte) {
3224 ret = pte_alloc_one_map(vmf);
3225 if (ret)
3226 return ret;
3227 }
3228
3229
3230 if (unlikely(!pte_none(*vmf->pte)))
3231 return VM_FAULT_NOPAGE;
3232
3233 flush_icache_page(vma, page);
3234 entry = mk_pte(page, vma->vm_page_prot);
3235 if (write)
3236 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3237
3238 if (write && !(vma->vm_flags & VM_SHARED)) {
3239 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3240 page_add_new_anon_rmap(page, vma, vmf->address, false);
3241 mem_cgroup_commit_charge(page, memcg, false, false);
3242 lru_cache_add_active_or_unevictable(page, vma);
3243 } else {
3244 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3245 page_add_file_rmap(page, false);
3246 }
3247 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3248
3249
3250 update_mmu_cache(vma, vmf->address, vmf->pte);
3251
3252 return 0;
3253}
3254
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared.  It handles locking of PTEs, adds the mapping
 * into the page tables and does accounting.  The function returns a
 * VM_FAULT_ code in case of error, 0 otherwise.
 *
 * The function expects the page to be locked and, on success, it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 */
3270vm_fault_t finish_fault(struct vm_fault *vmf)
3271{
3272 struct page *page;
3273 vm_fault_t ret = 0;
3274
3275
3276 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3277 !(vmf->vma->vm_flags & VM_SHARED))
3278 page = vmf->cow_page;
3279 else
3280 page = vmf->page;
3281
3282
3283
3284
3285
3286 if (!(vmf->vma->vm_flags & VM_SHARED))
3287 ret = check_stable_address_space(vmf->vma->vm_mm);
3288 if (!ret)
3289 ret = alloc_set_pte(vmf, vmf->memcg, page);
3290 if (vmf->pte)
3291 pte_unmap_unlock(vmf->pte, vmf->ptl);
3292 return ret;
3293}
3294
3295static unsigned long fault_around_bytes __read_mostly =
3296 rounddown_pow_of_two(65536);
3297
3298#ifdef CONFIG_DEBUG_FS
3299static int fault_around_bytes_get(void *data, u64 *val)
3300{
3301 *val = fault_around_bytes;
3302 return 0;
3303}
3304
/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
3309static int fault_around_bytes_set(void *data, u64 val)
3310{
3311 if (val / PAGE_SIZE > PTRS_PER_PTE)
3312 return -EINVAL;
3313 if (val > PAGE_SIZE)
3314 fault_around_bytes = rounddown_pow_of_two(val);
3315 else
3316 fault_around_bytes = PAGE_SIZE;
3317 return 0;
3318}
3319DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3320 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3321
3322static int __init fault_around_debugfs(void)
3323{
3324 void *ret;
3325
3326 ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3327 &fault_around_bytes_fops);
3328 if (!ret)
3329 pr_warn("Failed to create fault_around_bytes in debugfs");
3330 return 0;
3331}
3332late_initcall(fault_around_debugfs);
3333#endif
3334
/*
 * do_fault_around() tries to map a few pages around the fault address.  The
 * hope is that those pages will be needed soon and this will lower the
 * number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips a page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or
 * equal to PTRS_PER_PTE * PAGE_SIZE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size (and therefore
 * to page order).  This way it's easier to guarantee that we don't cross
 * page table boundaries.
 */
3359static vm_fault_t do_fault_around(struct vm_fault *vmf)
3360{
3361 unsigned long address = vmf->address, nr_pages, mask;
3362 pgoff_t start_pgoff = vmf->pgoff;
3363 pgoff_t end_pgoff;
3364 int off;
3365 vm_fault_t ret = 0;
3366
3367 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3368 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3369
3370 vmf->address = max(address & mask, vmf->vma->vm_start);
3371 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3372 start_pgoff -= off;
3373
3374
3375
3376
3377
3378 end_pgoff = start_pgoff -
3379 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3380 PTRS_PER_PTE - 1;
3381 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3382 start_pgoff + nr_pages - 1);
3383
3384 if (pmd_none(*vmf->pmd)) {
3385 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3386 if (!vmf->prealloc_pte)
3387 goto out;
3388 smp_wmb();
3389 }
3390
3391 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3392
3393
3394 if (pmd_trans_huge(*vmf->pmd)) {
3395 ret = VM_FAULT_NOPAGE;
3396 goto out;
3397 }
3398
3399
3400 if (!vmf->pte)
3401 goto out;
3402
3403
3404 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3405 if (!pte_none(*vmf->pte))
3406 ret = VM_FAULT_NOPAGE;
3407 pte_unmap_unlock(vmf->pte, vmf->ptl);
3408out:
3409 vmf->address = address;
3410 vmf->pte = NULL;
3411 return ret;
3412}
3413
3414static vm_fault_t do_read_fault(struct vm_fault *vmf)
3415{
3416 struct vm_area_struct *vma = vmf->vma;
3417 vm_fault_t ret = 0;
3418
/*
 * Let's call ->map_pages() first and use ->fault() as fallback
 * if the page at the offset is not ready to be mapped (cold cache
 * or something).
 */
3424 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3425 ret = do_fault_around(vmf);
3426 if (ret)
3427 return ret;
3428 }
3429
3430 ret = __do_fault(vmf);
3431 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3432 return ret;
3433
3434 ret |= finish_fault(vmf);
3435 unlock_page(vmf->page);
3436 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3437 put_page(vmf->page);
3438 return ret;
3439}
3440
3441static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3442{
3443 struct vm_area_struct *vma = vmf->vma;
3444 vm_fault_t ret;
3445
3446 if (unlikely(anon_vma_prepare(vma)))
3447 return VM_FAULT_OOM;
3448
3449 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3450 if (!vmf->cow_page)
3451 return VM_FAULT_OOM;
3452
3453 if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3454 &vmf->memcg, false)) {
3455 put_page(vmf->cow_page);
3456 return VM_FAULT_OOM;
3457 }
3458
3459 ret = __do_fault(vmf);
3460 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3461 goto uncharge_out;
3462 if (ret & VM_FAULT_DONE_COW)
3463 return ret;
3464
3465 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3466 __SetPageUptodate(vmf->cow_page);
3467
3468 ret |= finish_fault(vmf);
3469 unlock_page(vmf->page);
3470 put_page(vmf->page);
3471 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3472 goto uncharge_out;
3473 return ret;
3474uncharge_out:
3475 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3476 put_page(vmf->cow_page);
3477 return ret;
3478}
3479
3480static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3481{
3482 struct vm_area_struct *vma = vmf->vma;
3483 vm_fault_t ret, tmp;
3484
3485 ret = __do_fault(vmf);
3486 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3487 return ret;
3488
3489
3490
3491
3492
3493 if (vma->vm_ops->page_mkwrite) {
3494 unlock_page(vmf->page);
3495 tmp = do_page_mkwrite(vmf);
3496 if (unlikely(!tmp ||
3497 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3498 put_page(vmf->page);
3499 return tmp;
3500 }
3501 }
3502
3503 ret |= finish_fault(vmf);
3504 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3505 VM_FAULT_RETRY))) {
3506 unlock_page(vmf->page);
3507 put_page(vmf->page);
3508 return ret;
3509 }
3510
3511 fault_dirty_shared_page(vma, vmf->page);
3512 return ret;
3513}
3514
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_sem is released, the vma may become invalid (for example
 * by another thread calling munmap()).
 */
3521static vm_fault_t do_fault(struct vm_fault *vmf)
3522{
3523 struct vm_area_struct *vma = vmf->vma;
3524 vm_fault_t ret;
3525
3526
3527
3528
3529 if (!vma->vm_ops->fault) {
3530
3531
3532
3533
3534 if (unlikely(!pmd_present(*vmf->pmd)))
3535 ret = VM_FAULT_SIGBUS;
3536 else {
3537 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
3538 vmf->pmd,
3539 vmf->address,
3540 &vmf->ptl);
3541
3542
3543
3544
3545
3546
3547
3548 if (unlikely(pte_none(*vmf->pte)))
3549 ret = VM_FAULT_SIGBUS;
3550 else
3551 ret = VM_FAULT_NOPAGE;
3552
3553 pte_unmap_unlock(vmf->pte, vmf->ptl);
3554 }
3555 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
3556 ret = do_read_fault(vmf);
3557 else if (!(vma->vm_flags & VM_SHARED))
3558 ret = do_cow_fault(vmf);
3559 else
3560 ret = do_shared_fault(vmf);
3561
3562
3563 if (vmf->prealloc_pte) {
3564 pte_free(vma->vm_mm, vmf->prealloc_pte);
3565 vmf->prealloc_pte = NULL;
3566 }
3567 return ret;
3568}
3569
3570static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3571 unsigned long addr, int page_nid,
3572 int *flags)
3573{
3574 get_page(page);
3575
3576 count_vm_numa_event(NUMA_HINT_FAULTS);
3577 if (page_nid == numa_node_id()) {
3578 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3579 *flags |= TNF_FAULT_LOCAL;
3580 }
3581
3582 return mpol_misplaced(page, vma, addr);
3583}
3584
3585static vm_fault_t do_numa_page(struct vm_fault *vmf)
3586{
3587 struct vm_area_struct *vma = vmf->vma;
3588 struct page *page = NULL;
3589 int page_nid = -1;
3590 int last_cpupid;
3591 int target_nid;
3592 bool migrated = false;
3593 pte_t pte;
3594 bool was_writable = pte_savedwrite(vmf->orig_pte);
3595 int flags = 0;
3596
3597
3598
3599
3600
3601
3602 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3603 spin_lock(vmf->ptl);
3604 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3605 pte_unmap_unlock(vmf->pte, vmf->ptl);
3606 goto out;
3607 }
3608
3609
3610
3611
3612
3613 pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
3614 pte = pte_modify(pte, vma->vm_page_prot);
3615 pte = pte_mkyoung(pte);
3616 if (was_writable)
3617 pte = pte_mkwrite(pte);
3618 ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
3619 update_mmu_cache(vma, vmf->address, vmf->pte);
3620
3621 page = vm_normal_page(vma, vmf->address, pte);
3622 if (!page) {
3623 pte_unmap_unlock(vmf->pte, vmf->ptl);
3624 return 0;
3625 }
3626
3627
3628 if (PageCompound(page)) {
3629 pte_unmap_unlock(vmf->pte, vmf->ptl);
3630 return 0;
3631 }
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641 if (!pte_write(pte))
3642 flags |= TNF_NO_GROUP;
3643
3644
3645
3646
3647
3648 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3649 flags |= TNF_SHARED;
3650
3651 last_cpupid = page_cpupid_last(page);
3652 page_nid = page_to_nid(page);
3653 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3654 &flags);
3655 pte_unmap_unlock(vmf->pte, vmf->ptl);
3656 if (target_nid == -1) {
3657 put_page(page);
3658 goto out;
3659 }
3660
3661
3662 migrated = migrate_misplaced_page(page, vma, target_nid);
3663 if (migrated) {
3664 page_nid = target_nid;
3665 flags |= TNF_MIGRATED;
3666 } else
3667 flags |= TNF_MIGRATE_FAIL;
3668
3669out:
3670 if (page_nid != -1)
3671 task_numa_fault(last_cpupid, page_nid, 1, flags);
3672 return 0;
3673}
3674
3675static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
3676{
3677 if (vma_is_anonymous(vmf->vma))
3678 return do_huge_pmd_anonymous_page(vmf);
3679 if (vmf->vma->vm_ops->huge_fault)
3680 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3681 return VM_FAULT_FALLBACK;
3682}
3683
3684
3685static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3686{
3687 if (vma_is_anonymous(vmf->vma))
3688 return do_huge_pmd_wp_page(vmf, orig_pmd);
3689 if (vmf->vma->vm_ops->huge_fault)
3690 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3691
3692
3693 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3694 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3695
3696 return VM_FAULT_FALLBACK;
3697}
3698
3699static inline bool vma_is_accessible(struct vm_area_struct *vma)
3700{
3701 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3702}
3703
3704static vm_fault_t create_huge_pud(struct vm_fault *vmf)
3705{
3706#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3707
3708 if (vma_is_anonymous(vmf->vma))
3709 return VM_FAULT_FALLBACK;
3710 if (vmf->vma->vm_ops->huge_fault)
3711 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3712#endif
3713 return VM_FAULT_FALLBACK;
3714}
3715
3716static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3717{
3718#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3719
3720 if (vma_is_anonymous(vmf->vma))
3721 return VM_FAULT_FALLBACK;
3722 if (vmf->vma->vm_ops->huge_fault)
3723 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3724#endif
3725 return VM_FAULT_FALLBACK;
3726}
3727
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (i.e. the sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
3743static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
3744{
3745 pte_t entry;
3746
3747 if (unlikely(pmd_none(*vmf->pmd))) {
3748
3749
3750
3751
3752
3753
3754 vmf->pte = NULL;
3755 } else {
3756
3757 if (pmd_devmap_trans_unstable(vmf->pmd))
3758 return 0;
3759
3760
3761
3762
3763
3764
3765 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3766 vmf->orig_pte = *vmf->pte;
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776 barrier();
3777 if (pte_none(vmf->orig_pte)) {
3778 pte_unmap(vmf->pte);
3779 vmf->pte = NULL;
3780 }
3781 }
3782
3783 if (!vmf->pte) {
3784 if (vma_is_anonymous(vmf->vma))
3785 return do_anonymous_page(vmf);
3786 else
3787 return do_fault(vmf);
3788 }
3789
3790 if (!pte_present(vmf->orig_pte))
3791 return do_swap_page(vmf);
3792
3793 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3794 return do_numa_page(vmf);
3795
3796 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3797 spin_lock(vmf->ptl);
3798 entry = vmf->orig_pte;
3799 if (unlikely(!pte_same(*vmf->pte, entry)))
3800 goto unlock;
3801 if (vmf->flags & FAULT_FLAG_WRITE) {
3802 if (!pte_write(entry))
3803 return do_wp_page(vmf);
3804 entry = pte_mkdirty(entry);
3805 }
3806 entry = pte_mkyoung(entry);
3807 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3808 vmf->flags & FAULT_FLAG_WRITE)) {
3809 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3810 } else {
3811
3812
3813
3814
3815
3816
3817 if (vmf->flags & FAULT_FLAG_WRITE)
3818 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3819 }
3820unlock:
3821 pte_unmap_unlock(vmf->pte, vmf->ptl);
3822 return 0;
3823}
3824
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3831static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3832 unsigned long address, unsigned int flags)
3833{
3834 struct vm_fault vmf = {
3835 .vma = vma,
3836 .address = address & PAGE_MASK,
3837 .flags = flags,
3838 .pgoff = linear_page_index(vma, address),
3839 .gfp_mask = __get_fault_gfp_mask(vma),
3840 };
3841 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3842 struct mm_struct *mm = vma->vm_mm;
3843 pgd_t *pgd;
3844 p4d_t *p4d;
3845 vm_fault_t ret;
3846
3847 pgd = pgd_offset(mm, address);
3848 p4d = p4d_alloc(mm, pgd, address);
3849 if (!p4d)
3850 return VM_FAULT_OOM;
3851
3852 vmf.pud = pud_alloc(mm, p4d, address);
3853 if (!vmf.pud)
3854 return VM_FAULT_OOM;
3855 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3856 ret = create_huge_pud(&vmf);
3857 if (!(ret & VM_FAULT_FALLBACK))
3858 return ret;
3859 } else {
3860 pud_t orig_pud = *vmf.pud;
3861
3862 barrier();
3863 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3864
3865
3866
3867 if (dirty && !pud_write(orig_pud)) {
3868 ret = wp_huge_pud(&vmf, orig_pud);
3869 if (!(ret & VM_FAULT_FALLBACK))
3870 return ret;
3871 } else {
3872 huge_pud_set_accessed(&vmf, orig_pud);
3873 return 0;
3874 }
3875 }
3876 }
3877
3878 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3879 if (!vmf.pmd)
3880 return VM_FAULT_OOM;
3881 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3882 ret = create_huge_pmd(&vmf);
3883 if (!(ret & VM_FAULT_FALLBACK))
3884 return ret;
3885 } else {
3886 pmd_t orig_pmd = *vmf.pmd;
3887
3888 barrier();
3889 if (unlikely(is_swap_pmd(orig_pmd))) {
3890 VM_BUG_ON(thp_migration_supported() &&
3891 !is_pmd_migration_entry(orig_pmd));
3892 if (is_pmd_migration_entry(orig_pmd))
3893 pmd_migration_entry_wait(mm, vmf.pmd);
3894 return 0;
3895 }
3896 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3897 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3898 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3899
3900 if (dirty && !pmd_write(orig_pmd)) {
3901 ret = wp_huge_pmd(&vmf, orig_pmd);
3902 if (!(ret & VM_FAULT_FALLBACK))
3903 return ret;
3904 } else {
3905 huge_pmd_set_accessed(&vmf, orig_pmd);
3906 return 0;
3907 }
3908 }
3909 }
3910
3911 return handle_pte_fault(&vmf);
3912}
3913
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3920vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3921 unsigned int flags)
3922{
3923 vm_fault_t ret;
3924
3925 __set_current_state(TASK_RUNNING);
3926
3927 count_vm_event(PGFAULT);
3928 count_memcg_event_mm(vma->vm_mm, PGFAULT);
3929
3930
3931 check_sync_rss_stat(current);
3932
3933 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3934 flags & FAULT_FLAG_INSTRUCTION,
3935 flags & FAULT_FLAG_REMOTE))
3936 return VM_FAULT_SIGSEGV;
3937
3938
3939
3940
3941
3942 if (flags & FAULT_FLAG_USER)
3943 mem_cgroup_enter_user_fault();
3944
3945 if (unlikely(is_vm_hugetlb_page(vma)))
3946 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
3947 else
3948 ret = __handle_mm_fault(vma, address, flags);
3949
3950 if (flags & FAULT_FLAG_USER) {
3951 mem_cgroup_exit_user_fault();
3952
3953
3954
3955
3956
3957
3958 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3959 mem_cgroup_oom_synchronize(false);
3960 }
3961
3962 return ret;
3963}
3964EXPORT_SYMBOL_GPL(handle_mm_fault);
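
/*
 * Illustrative sketch (hypothetical, heavily simplified): the shape of the
 * architecture page-fault path that ends up in handle_mm_fault().  Real
 * handlers also deal with kernel-mode faults, stack expansion, retries with
 * FAULT_FLAG_TRIED and signal delivery.
 */
static vm_fault_t example_handle_user_fault(struct mm_struct *mm,
					    unsigned long address, bool write)
{
	unsigned int flags = FAULT_FLAG_USER | FAULT_FLAG_ALLOW_RETRY |
			     FAULT_FLAG_KILLABLE |
			     (write ? FAULT_FLAG_WRITE : 0);
	struct vm_area_struct *vma;
	vm_fault_t fault;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGSEGV;	/* no VMA covers the address */
	}
	fault = handle_mm_fault(vma, address, flags);
	if (!(fault & VM_FAULT_RETRY))
		up_read(&mm->mmap_sem);	/* on RETRY the lock was already dropped */
	return fault;
}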
3965
3966#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
3971int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3972{
3973 p4d_t *new = p4d_alloc_one(mm, address);
3974 if (!new)
3975 return -ENOMEM;
3976
3977 smp_wmb();
3978
3979 spin_lock(&mm->page_table_lock);
3980 if (pgd_present(*pgd))
3981 p4d_free(mm, new);
3982 else
3983 pgd_populate(mm, pgd, new);
3984 spin_unlock(&mm->page_table_lock);
3985 return 0;
3986}
3987#endif
3988
3989#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
3994int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
3995{
3996 pud_t *new = pud_alloc_one(mm, address);
3997 if (!new)
3998 return -ENOMEM;
3999
4000 smp_wmb();
4001
4002 spin_lock(&mm->page_table_lock);
4003#ifndef __ARCH_HAS_5LEVEL_HACK
4004 if (!p4d_present(*p4d)) {
4005 mm_inc_nr_puds(mm);
4006 p4d_populate(mm, p4d, new);
4007 } else
4008 pud_free(mm, new);
4009#else
4010 if (!pgd_present(*p4d)) {
4011 mm_inc_nr_puds(mm);
4012 pgd_populate(mm, p4d, new);
4013 } else
4014 pud_free(mm, new);
4015#endif
4016 spin_unlock(&mm->page_table_lock);
4017 return 0;
4018}
4019#endif
4020
4021#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
4026int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4027{
4028 spinlock_t *ptl;
4029 pmd_t *new = pmd_alloc_one(mm, address);
4030 if (!new)
4031 return -ENOMEM;
4032
4033 smp_wmb();
4034
4035 ptl = pud_lock(mm, pud);
4036#ifndef __ARCH_HAS_4LEVEL_HACK
4037 if (!pud_present(*pud)) {
4038 mm_inc_nr_pmds(mm);
4039 pud_populate(mm, pud, new);
4040 } else
4041 pmd_free(mm, new);
4042#else
4043 if (!pgd_present(*pud)) {
4044 mm_inc_nr_pmds(mm);
4045 pgd_populate(mm, pud, new);
4046 } else
4047 pmd_free(mm, new);
4048#endif
4049 spin_unlock(ptl);
4050 return 0;
4051}
4052#endif
4053
4054static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4055 struct mmu_notifier_range *range,
4056 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4057{
4058 pgd_t *pgd;
4059 p4d_t *p4d;
4060 pud_t *pud;
4061 pmd_t *pmd;
4062 pte_t *ptep;
4063
4064 pgd = pgd_offset(mm, address);
4065 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4066 goto out;
4067
4068 p4d = p4d_offset(pgd, address);
4069 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4070 goto out;
4071
4072 pud = pud_offset(p4d, address);
4073 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4074 goto out;
4075
4076 pmd = pmd_offset(pud, address);
4077 VM_BUG_ON(pmd_trans_huge(*pmd));
4078
4079 if (pmd_huge(*pmd)) {
4080 if (!pmdpp)
4081 goto out;
4082
4083 if (range) {
4084 mmu_notifier_range_init(range, mm, address & PMD_MASK,
4085 (address & PMD_MASK) + PMD_SIZE);
4086 mmu_notifier_invalidate_range_start(range);
4087 }
4088 *ptlp = pmd_lock(mm, pmd);
4089 if (pmd_huge(*pmd)) {
4090 *pmdpp = pmd;
4091 return 0;
4092 }
4093 spin_unlock(*ptlp);
4094 if (range)
4095 mmu_notifier_invalidate_range_end(range);
4096 }
4097
4098 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4099 goto out;
4100
4101 if (range) {
4102 mmu_notifier_range_init(range, mm, address & PAGE_MASK,
4103 (address & PAGE_MASK) + PAGE_SIZE);
4104 mmu_notifier_invalidate_range_start(range);
4105 }
4106 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4107 if (!pte_present(*ptep))
4108 goto unlock;
4109 *ptepp = ptep;
4110 return 0;
4111unlock:
4112 pte_unmap_unlock(ptep, *ptlp);
4113 if (range)
4114 mmu_notifier_invalidate_range_end(range);
4115out:
4116 return -EINVAL;
4117}
4118
4119static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4120 pte_t **ptepp, spinlock_t **ptlp)
4121{
4122 int res;
4123
4124
4125 (void) __cond_lock(*ptlp,
4126 !(res = __follow_pte_pmd(mm, address, NULL,
4127 ptepp, NULL, ptlp)));
4128 return res;
4129}
4130
4131int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4132 struct mmu_notifier_range *range,
4133 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4134{
4135 int res;
4136
4137
4138 (void) __cond_lock(*ptlp,
4139 !(res = __follow_pte_pmd(mm, address, range,
4140 ptepp, pmdpp, ptlp)));
4141 return res;
4142}
4143EXPORT_SYMBOL(follow_pte_pmd);
4144
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
4155int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4156 unsigned long *pfn)
4157{
4158 int ret = -EINVAL;
4159 spinlock_t *ptl;
4160 pte_t *ptep;
4161
4162 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4163 return ret;
4164
4165 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4166 if (ret)
4167 return ret;
4168 *pfn = pte_pfn(*ptep);
4169 pte_unmap_unlock(ptep, ptl);
4170 return 0;
4171}
4172EXPORT_SYMBOL(follow_pfn);
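
/*
 * Illustrative sketch (hypothetical helper): looking up the pfn behind a
 * VM_IO/VM_PFNMAP mapping, e.g. from a driver ioctl.  mmap_sem must be held
 * across the lookup, and the result is stale as soon as the lock is dropped.
 */
static int example_lookup_pfn(struct mm_struct *mm, unsigned long addr,
			      unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EINVAL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr)
		ret = follow_pfn(vma, addr, pfn);
	up_read(&mm->mmap_sem);
	return ret;
}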
4173
4174#ifdef CONFIG_HAVE_IOREMAP_PROT
4175int follow_phys(struct vm_area_struct *vma,
4176 unsigned long address, unsigned int flags,
4177 unsigned long *prot, resource_size_t *phys)
4178{
4179 int ret = -EINVAL;
4180 pte_t *ptep, pte;
4181 spinlock_t *ptl;
4182
4183 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4184 goto out;
4185
4186 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4187 goto out;
4188 pte = *ptep;
4189
4190 if ((flags & FOLL_WRITE) && !pte_write(pte))
4191 goto unlock;
4192
4193 *prot = pgprot_val(pte_pgprot(pte));
4194 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4195
4196 ret = 0;
4197unlock:
4198 pte_unmap_unlock(ptep, ptl);
4199out:
4200 return ret;
4201}
4202
4203int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4204 void *buf, int len, int write)
4205{
4206 resource_size_t phys_addr;
4207 unsigned long prot = 0;
4208 void __iomem *maddr;
4209 int offset = addr & (PAGE_SIZE-1);
4210
4211 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4212 return -EINVAL;
4213
4214 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4215 if (!maddr)
4216 return -ENOMEM;
4217
4218 if (write)
4219 memcpy_toio(maddr + offset, buf, len);
4220 else
4221 memcpy_fromio(buf, maddr + offset, len);
4222 iounmap(maddr);
4223
4224 return len;
4225}
4226EXPORT_SYMBOL_GPL(generic_access_phys);
4227#endif
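
/*
 * Illustrative sketch (hypothetical): a driver that maps MMIO with
 * remap_pfn_range() can plug generic_access_phys() into its vm_operations
 * so that access_process_vm()/ptrace can read and write the mapping.
 */
#ifdef CONFIG_HAVE_IOREMAP_PROT
static const struct vm_operations_struct example_mmio_vm_ops = {
	.access = generic_access_phys,
};
#endif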
4228
/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
4233int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4234 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4235{
4236 struct vm_area_struct *vma;
4237 void *old_buf = buf;
4238 int write = gup_flags & FOLL_WRITE;
4239
4240 down_read(&mm->mmap_sem);
4241
4242 while (len) {
4243 int bytes, ret, offset;
4244 void *maddr;
4245 struct page *page = NULL;
4246
4247 ret = get_user_pages_remote(tsk, mm, addr, 1,
4248 gup_flags, &page, &vma, NULL);
4249 if (ret <= 0) {
4250#ifndef CONFIG_HAVE_IOREMAP_PROT
4251 break;
4252#else
4253
4254
4255
4256
4257 vma = find_vma(mm, addr);
4258 if (!vma || vma->vm_start > addr)
4259 break;
4260 if (vma->vm_ops && vma->vm_ops->access)
4261 ret = vma->vm_ops->access(vma, addr, buf,
4262 len, write);
4263 if (ret <= 0)
4264 break;
4265 bytes = ret;
4266#endif
4267 } else {
4268 bytes = len;
4269 offset = addr & (PAGE_SIZE-1);
4270 if (bytes > PAGE_SIZE-offset)
4271 bytes = PAGE_SIZE-offset;
4272
4273 maddr = kmap(page);
4274 if (write) {
4275 copy_to_user_page(vma, page, addr,
4276 maddr + offset, buf, bytes);
4277 set_page_dirty_lock(page);
4278 } else {
4279 copy_from_user_page(vma, page, addr,
4280 buf, maddr + offset, bytes);
4281 }
4282 kunmap(page);
4283 put_page(page);
4284 }
4285 len -= bytes;
4286 buf += bytes;
4287 addr += bytes;
4288 }
4289 up_read(&mm->mmap_sem);
4290
4291 return buf - old_buf;
4292}
4293
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
4304int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4305 void *buf, int len, unsigned int gup_flags)
4306{
4307 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4308}
4309
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Do not walk the page table directly; use get_user_pages().
 */
4315int access_process_vm(struct task_struct *tsk, unsigned long addr,
4316 void *buf, int len, unsigned int gup_flags)
4317{
4318 struct mm_struct *mm;
4319 int ret;
4320
4321 mm = get_task_mm(tsk);
4322 if (!mm)
4323 return 0;
4324
4325 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4326
4327 mmput(mm);
4328
4329 return ret;
4330}
4331EXPORT_SYMBOL_GPL(access_process_vm);
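
/*
 * Illustrative sketch (hypothetical helper): reading one word from another
 * task's address space, roughly what ptrace(PTRACE_PEEKDATA) boils down to.
 * FOLL_FORCE lets a debugger read mappings the target could not fault for
 * itself (e.g. read-protected text).
 */
static int example_peek_word(struct task_struct *child, unsigned long addr,
			     unsigned long *val)
{
	int copied = access_process_vm(child, addr, val, sizeof(*val),
				       FOLL_FORCE);

	return copied == sizeof(*val) ? 0 : -EIO;
}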
4332
/*
 * Print the name of a VMA.
 */
4336void print_vma_addr(char *prefix, unsigned long ip)
4337{
4338 struct mm_struct *mm = current->mm;
4339 struct vm_area_struct *vma;
4340
4341
4342
4343
4344 if (!down_read_trylock(&mm->mmap_sem))
4345 return;
4346
4347 vma = find_vma(mm, ip);
4348 if (vma && vma->vm_file) {
4349 struct file *f = vma->vm_file;
4350 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4351 if (buf) {
4352 char *p;
4353
4354 p = file_path(f, buf, PAGE_SIZE);
4355 if (IS_ERR(p))
4356 p = "?";
4357 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4358 vma->vm_start,
4359 vma->vm_end - vma->vm_start);
4360 free_page((unsigned long)buf);
4361 }
4362 }
4363 up_read(&mm->mmap_sem);
4364}
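
/*
 * Illustrative sketch (hypothetical): how an architecture's unhandled-fault
 * report typically uses print_vma_addr(), producing output like
 * "comm[pid]: unhandled fault, ip ... in libfoo.so[7f..+..]".
 */
static void example_report_fault_location(unsigned long ip)
{
	printk(KERN_INFO "%s[%d]: unhandled fault, ip %px",
	       current->comm, task_pid_nr(current), (void *)ip);
	print_vma_addr(KERN_CONT " in ", ip);
	printk(KERN_CONT "\n");
}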
4365
4366#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4367void __might_fault(const char *file, int line)
4368{
4369
4370
4371
4372
4373
4374
4375 if (uaccess_kernel())
4376 return;
4377 if (pagefault_disabled())
4378 return;
4379 __might_sleep(file, line, 0);
4380#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4381 if (current->mm)
4382 might_lock_read(&current->mm->mmap_sem);
4383#endif
4384}
4385EXPORT_SYMBOL(__might_fault);
4386#endif
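
/*
 * Illustrative sketch (hypothetical helper): __might_fault() is normally
 * reached via the might_fault() annotation placed at the top of helpers that
 * may sleep on a user-space access, so the debug configs above can flag
 * callers that hold spinlocks or are otherwise atomic.
 */
static inline int example_read_user_flag(const unsigned int __user *uaddr,
					 unsigned int *flag)
{
	might_fault();
	return get_user(*flag, uaddr);
}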
4387
4388#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The subpages farthest from the target subpage are processed
 * first, so that the target subpage and its neighbours are touched last
 * and stay cache-hot when the faulting task resumes.
 */
4394static inline void process_huge_page(
4395 unsigned long addr_hint, unsigned int pages_per_huge_page,
4396 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4397 void *arg)
4398{
4399 int i, n, base, l;
4400 unsigned long addr = addr_hint &
4401 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4402
4403
4404 might_sleep();
4405 n = (addr_hint - addr) / PAGE_SIZE;
4406 if (2 * n <= pages_per_huge_page) {
4407
4408 base = 0;
4409 l = n;
4410
4411 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4412 cond_resched();
4413 process_subpage(addr + i * PAGE_SIZE, i, arg);
4414 }
4415 } else {
4416
4417 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4418 l = pages_per_huge_page - n;
4419
4420 for (i = 0; i < base; i++) {
4421 cond_resched();
4422 process_subpage(addr + i * PAGE_SIZE, i, arg);
4423 }
4424 }
4425
4426
4427
4428
4429 for (i = 0; i < l; i++) {
4430 int left_idx = base + i;
4431 int right_idx = base + 2 * l - 1 - i;
4432
4433 cond_resched();
4434 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4435 cond_resched();
4436 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4437 }
4438}
4439
4440static void clear_gigantic_page(struct page *page,
4441 unsigned long addr,
4442 unsigned int pages_per_huge_page)
4443{
4444 int i;
4445 struct page *p = page;
4446
4447 might_sleep();
4448 for (i = 0; i < pages_per_huge_page;
4449 i++, p = mem_map_next(p, page, i)) {
4450 cond_resched();
4451 clear_user_highpage(p, addr + i * PAGE_SIZE);
4452 }
4453}
4454
4455static void clear_subpage(unsigned long addr, int idx, void *arg)
4456{
4457 struct page *page = arg;
4458
4459 clear_user_highpage(page + idx, addr);
4460}
4461
4462void clear_huge_page(struct page *page,
4463 unsigned long addr_hint, unsigned int pages_per_huge_page)
4464{
4465 unsigned long addr = addr_hint &
4466 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4467
4468 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4469 clear_gigantic_page(page, addr, pages_per_huge_page);
4470 return;
4471 }
4472
4473 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4474}
4475
4476static void copy_user_gigantic_page(struct page *dst, struct page *src,
4477 unsigned long addr,
4478 struct vm_area_struct *vma,
4479 unsigned int pages_per_huge_page)
4480{
4481 int i;
4482 struct page *dst_base = dst;
4483 struct page *src_base = src;
4484
4485 for (i = 0; i < pages_per_huge_page; ) {
4486 cond_resched();
4487 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4488
4489 i++;
4490 dst = mem_map_next(dst, dst_base, i);
4491 src = mem_map_next(src, src_base, i);
4492 }
4493}
4494
4495struct copy_subpage_arg {
4496 struct page *dst;
4497 struct page *src;
4498 struct vm_area_struct *vma;
4499};
4500
4501static void copy_subpage(unsigned long addr, int idx, void *arg)
4502{
4503 struct copy_subpage_arg *copy_arg = arg;
4504
4505 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4506 addr, copy_arg->vma);
4507}
4508
4509void copy_user_huge_page(struct page *dst, struct page *src,
4510 unsigned long addr_hint, struct vm_area_struct *vma,
4511 unsigned int pages_per_huge_page)
4512{
4513 unsigned long addr = addr_hint &
4514 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4515 struct copy_subpage_arg arg = {
4516 .dst = dst,
4517 .src = src,
4518 .vma = vma,
4519 };
4520
4521 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4522 copy_user_gigantic_page(dst, src, addr, vma,
4523 pages_per_huge_page);
4524 return;
4525 }
4526
4527 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4528}
4529
4530long copy_huge_page_from_user(struct page *dst_page,
4531 const void __user *usr_src,
4532 unsigned int pages_per_huge_page,
4533 bool allow_pagefault)
4534{
4535 void *src = (void *)usr_src;
4536 void *page_kaddr;
4537 unsigned long i, rc = 0;
4538 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4539
4540 for (i = 0; i < pages_per_huge_page; i++) {
4541 if (allow_pagefault)
4542 page_kaddr = kmap(dst_page + i);
4543 else
4544 page_kaddr = kmap_atomic(dst_page + i);
4545 rc = copy_from_user(page_kaddr,
4546 (const void __user *)(src + i * PAGE_SIZE),
4547 PAGE_SIZE);
4548 if (allow_pagefault)
4549 kunmap(dst_page + i);
4550 else
4551 kunmap_atomic(page_kaddr);
4552
4553 ret_val -= (PAGE_SIZE - rc);
4554 if (rc)
4555 break;
4556
4557 cond_resched();
4558 }
4559 return ret_val;
4560}
4561#endif
4562
4563#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4564
4565static struct kmem_cache *page_ptl_cachep;
4566
4567void __init ptlock_cache_init(void)
4568{
4569 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4570 SLAB_PANIC, NULL);
4571}
4572
4573bool ptlock_alloc(struct page *page)
4574{
4575 spinlock_t *ptl;
4576
4577 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4578 if (!ptl)
4579 return false;
4580 page->ptl = ptl;
4581 return true;
4582}
4583
4584void ptlock_free(struct page *page)
4585{
4586 kmem_cache_free(page_ptl_cachep, page->ptl);
4587}
4588#endif
4589