1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/pfn_t.h>
54#include <linux/writeback.h>
55#include <linux/memcontrol.h>
56#include <linux/mmu_notifier.h>
57#include <linux/kallsyms.h>
58#include <linux/swapops.h>
59#include <linux/elf.h>
60#include <linux/gfp.h>
61#include <linux/migrate.h>
62#include <linux/string.h>
63#include <linux/dma-debug.h>
64#include <linux/debugfs.h>
65#include <linux/userfaultfd_k.h>
66#include <linux/dax.h>
67
68#include <asm/io.h>
69#include <asm/mmu_context.h>
70#include <asm/pgalloc.h>
71#include <asm/uaccess.h>
72#include <asm/tlb.h>
73#include <asm/tlbflush.h>
74#include <asm/pgtable.h>
75
76#include "internal.h"
77
78#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
79#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
80#endif
81
82#ifndef CONFIG_NEED_MULTIPLE_NODES
83
84unsigned long max_mapnr;
85struct page *mem_map;
86
87EXPORT_SYMBOL(max_mapnr);
88EXPORT_SYMBOL(mem_map);
89#endif
90
91
92
93
94
95
96
97
98void * high_memory;
99
100EXPORT_SYMBOL(high_memory);
101
102
103
104
105
106
107
108int randomize_va_space __read_mostly =
109#ifdef CONFIG_COMPAT_BRK
110 1;
111#else
112 2;
113#endif
114
115static int __init disable_randmaps(char *s)
116{
117 randomize_va_space = 0;
118 return 1;
119}
120__setup("norandmaps", disable_randmaps);
121
122unsigned long zero_pfn __read_mostly;
123unsigned long highest_memmap_pfn __read_mostly;
124
125EXPORT_SYMBOL(zero_pfn);
126
127
128
129
130static int __init init_zero_pfn(void)
131{
132 zero_pfn = page_to_pfn(ZERO_PAGE(0));
133 return 0;
134}
135core_initcall(init_zero_pfn);
136
137
138#if defined(SPLIT_RSS_COUNTING)
139
140void sync_mm_rss(struct mm_struct *mm)
141{
142 int i;
143
144 for (i = 0; i < NR_MM_COUNTERS; i++) {
145 if (current->rss_stat.count[i]) {
146 add_mm_counter(mm, i, current->rss_stat.count[i]);
147 current->rss_stat.count[i] = 0;
148 }
149 }
150 current->rss_stat.events = 0;
151}
152
153static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
154{
155 struct task_struct *task = current;
156
157 if (likely(task->mm == mm))
158 task->rss_stat.count[member] += val;
159 else
160 add_mm_counter(mm, member, val);
161}
162#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
163#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
164
165
166#define TASK_RSS_EVENTS_THRESH (64)
167static void check_sync_rss_stat(struct task_struct *task)
168{
169 if (unlikely(task != current))
170 return;
171 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
172 sync_mm_rss(task->mm);
173}
174#else
175
176#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
177#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
178
179static void check_sync_rss_stat(struct task_struct *task)
180{
181}
182
183#endif
184
185#ifdef HAVE_GENERIC_MMU_GATHER
186
187static bool tlb_next_batch(struct mmu_gather *tlb)
188{
189 struct mmu_gather_batch *batch;
190
191 batch = tlb->active;
192 if (batch->next) {
193 tlb->active = batch->next;
194 return true;
195 }
196
197 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
198 return false;
199
200 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
201 if (!batch)
202 return false;
203
204 tlb->batch_count++;
205 batch->next = NULL;
206 batch->nr = 0;
207 batch->max = MAX_GATHER_BATCH;
208
209 tlb->active->next = batch;
210 tlb->active = batch;
211
212 return true;
213}
214
215
216
217
218
219
220void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
221{
222 tlb->mm = mm;
223
224
225 tlb->fullmm = !(start | (end+1));
226 tlb->need_flush_all = 0;
227 tlb->local.next = NULL;
228 tlb->local.nr = 0;
229 tlb->local.max = ARRAY_SIZE(tlb->__pages);
230 tlb->active = &tlb->local;
231 tlb->batch_count = 0;
232
233#ifdef CONFIG_HAVE_RCU_TABLE_FREE
234 tlb->batch = NULL;
235#endif
236 tlb->page_size = 0;
237
238 __tlb_reset_range(tlb);
239}
240
241static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
242{
243 if (!tlb->end)
244 return;
245
246 tlb_flush(tlb);
247 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
248#ifdef CONFIG_HAVE_RCU_TABLE_FREE
249 tlb_table_flush(tlb);
250#endif
251 __tlb_reset_range(tlb);
252}
253
254static void tlb_flush_mmu_free(struct mmu_gather *tlb)
255{
256 struct mmu_gather_batch *batch;
257
258 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
259 free_pages_and_swap_cache(batch->pages, batch->nr);
260 batch->nr = 0;
261 }
262 tlb->active = &tlb->local;
263}
264
265void tlb_flush_mmu(struct mmu_gather *tlb)
266{
267 tlb_flush_mmu_tlbonly(tlb);
268 tlb_flush_mmu_free(tlb);
269}
270
271
272
273
274
275void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
276{
277 struct mmu_gather_batch *batch, *next;
278
279 tlb_flush_mmu(tlb);
280
281
282 check_pgt_cache();
283
284 for (batch = tlb->local.next; batch; batch = next) {
285 next = batch->next;
286 free_pages((unsigned long)batch, 0);
287 }
288 tlb->local.next = NULL;
289}
290
291
292
293
294
295
296
297
298bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
299{
300 struct mmu_gather_batch *batch;
301
302 VM_BUG_ON(!tlb->end);
303
304 if (!tlb->page_size)
305 tlb->page_size = page_size;
306 else {
307 if (page_size != tlb->page_size)
308 return true;
309 }
310
311 batch = tlb->active;
312 if (batch->nr == batch->max) {
313 if (!tlb_next_batch(tlb))
314 return true;
315 batch = tlb->active;
316 }
317 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
318
319 batch->pages[batch->nr++] = page;
320 return false;
321}
322
323#endif
324
325#ifdef CONFIG_HAVE_RCU_TABLE_FREE
326
327
328
329
330
331static void tlb_remove_table_smp_sync(void *arg)
332{
333
334}
335
336static void tlb_remove_table_one(void *table)
337{
338
339
340
341
342
343
344
345 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
346 __tlb_remove_table(table);
347}
348
349static void tlb_remove_table_rcu(struct rcu_head *head)
350{
351 struct mmu_table_batch *batch;
352 int i;
353
354 batch = container_of(head, struct mmu_table_batch, rcu);
355
356 for (i = 0; i < batch->nr; i++)
357 __tlb_remove_table(batch->tables[i]);
358
359 free_page((unsigned long)batch);
360}
361
362void tlb_table_flush(struct mmu_gather *tlb)
363{
364 struct mmu_table_batch **batch = &tlb->batch;
365
366 if (*batch) {
367 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
368 *batch = NULL;
369 }
370}
371
372void tlb_remove_table(struct mmu_gather *tlb, void *table)
373{
374 struct mmu_table_batch **batch = &tlb->batch;
375
376
377
378
379
380 if (atomic_read(&tlb->mm->mm_users) < 2) {
381 __tlb_remove_table(table);
382 return;
383 }
384
385 if (*batch == NULL) {
386 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
387 if (*batch == NULL) {
388 tlb_remove_table_one(table);
389 return;
390 }
391 (*batch)->nr = 0;
392 }
393 (*batch)->tables[(*batch)->nr++] = table;
394 if ((*batch)->nr == MAX_TABLE_BATCH)
395 tlb_table_flush(tlb);
396}
397
398#endif
399
400
401
402
403
404static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
405 unsigned long addr)
406{
407 pgtable_t token = pmd_pgtable(*pmd);
408 pmd_clear(pmd);
409 pte_free_tlb(tlb, token, addr);
410 atomic_long_dec(&tlb->mm->nr_ptes);
411}
412
413static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
414 unsigned long addr, unsigned long end,
415 unsigned long floor, unsigned long ceiling)
416{
417 pmd_t *pmd;
418 unsigned long next;
419 unsigned long start;
420
421 start = addr;
422 pmd = pmd_offset(pud, addr);
423 do {
424 next = pmd_addr_end(addr, end);
425 if (pmd_none_or_clear_bad(pmd))
426 continue;
427 free_pte_range(tlb, pmd, addr);
428 } while (pmd++, addr = next, addr != end);
429
430 start &= PUD_MASK;
431 if (start < floor)
432 return;
433 if (ceiling) {
434 ceiling &= PUD_MASK;
435 if (!ceiling)
436 return;
437 }
438 if (end - 1 > ceiling - 1)
439 return;
440
441 pmd = pmd_offset(pud, start);
442 pud_clear(pud);
443 pmd_free_tlb(tlb, pmd, start);
444 mm_dec_nr_pmds(tlb->mm);
445}
446
447static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
448 unsigned long addr, unsigned long end,
449 unsigned long floor, unsigned long ceiling)
450{
451 pud_t *pud;
452 unsigned long next;
453 unsigned long start;
454
455 start = addr;
456 pud = pud_offset(pgd, addr);
457 do {
458 next = pud_addr_end(addr, end);
459 if (pud_none_or_clear_bad(pud))
460 continue;
461 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
462 } while (pud++, addr = next, addr != end);
463
464 start &= PGDIR_MASK;
465 if (start < floor)
466 return;
467 if (ceiling) {
468 ceiling &= PGDIR_MASK;
469 if (!ceiling)
470 return;
471 }
472 if (end - 1 > ceiling - 1)
473 return;
474
475 pud = pud_offset(pgd, start);
476 pgd_clear(pgd);
477 pud_free_tlb(tlb, pud, start);
478}
479
480
481
482
483void free_pgd_range(struct mmu_gather *tlb,
484 unsigned long addr, unsigned long end,
485 unsigned long floor, unsigned long ceiling)
486{
487 pgd_t *pgd;
488 unsigned long next;
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 addr &= PMD_MASK;
517 if (addr < floor) {
518 addr += PMD_SIZE;
519 if (!addr)
520 return;
521 }
522 if (ceiling) {
523 ceiling &= PMD_MASK;
524 if (!ceiling)
525 return;
526 }
527 if (end - 1 > ceiling - 1)
528 end -= PMD_SIZE;
529 if (addr > end - 1)
530 return;
531
532 pgd = pgd_offset(tlb->mm, addr);
533 do {
534 next = pgd_addr_end(addr, end);
535 if (pgd_none_or_clear_bad(pgd))
536 continue;
537 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
538 } while (pgd++, addr = next, addr != end);
539}
540
541void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
542 unsigned long floor, unsigned long ceiling)
543{
544 while (vma) {
545 struct vm_area_struct *next = vma->vm_next;
546 unsigned long addr = vma->vm_start;
547
548
549
550
551
552 unlink_anon_vmas(vma);
553 unlink_file_vma(vma);
554
555 if (is_vm_hugetlb_page(vma)) {
556 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
557 floor, next? next->vm_start: ceiling);
558 } else {
559
560
561
562 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
563 && !is_vm_hugetlb_page(next)) {
564 vma = next;
565 next = vma->vm_next;
566 unlink_anon_vmas(vma);
567 unlink_file_vma(vma);
568 }
569 free_pgd_range(tlb, addr, vma->vm_end,
570 floor, next? next->vm_start: ceiling);
571 }
572 vma = next;
573 }
574}
575
576int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
577{
578 spinlock_t *ptl;
579 pgtable_t new = pte_alloc_one(mm, address);
580 if (!new)
581 return -ENOMEM;
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596 smp_wmb();
597
598 ptl = pmd_lock(mm, pmd);
599 if (likely(pmd_none(*pmd))) {
600 atomic_long_inc(&mm->nr_ptes);
601 pmd_populate(mm, pmd, new);
602 new = NULL;
603 }
604 spin_unlock(ptl);
605 if (new)
606 pte_free(mm, new);
607 return 0;
608}
609
610int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
611{
612 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
613 if (!new)
614 return -ENOMEM;
615
616 smp_wmb();
617
618 spin_lock(&init_mm.page_table_lock);
619 if (likely(pmd_none(*pmd))) {
620 pmd_populate_kernel(&init_mm, pmd, new);
621 new = NULL;
622 }
623 spin_unlock(&init_mm.page_table_lock);
624 if (new)
625 pte_free_kernel(&init_mm, new);
626 return 0;
627}
628
629static inline void init_rss_vec(int *rss)
630{
631 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
632}
633
634static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
635{
636 int i;
637
638 if (current->mm == mm)
639 sync_mm_rss(mm);
640 for (i = 0; i < NR_MM_COUNTERS; i++)
641 if (rss[i])
642 add_mm_counter(mm, i, rss[i]);
643}
644
645
646
647
648
649
650
651
652static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
653 pte_t pte, struct page *page)
654{
655 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
656 pud_t *pud = pud_offset(pgd, addr);
657 pmd_t *pmd = pmd_offset(pud, addr);
658 struct address_space *mapping;
659 pgoff_t index;
660 static unsigned long resume;
661 static unsigned long nr_shown;
662 static unsigned long nr_unshown;
663
664
665
666
667
668 if (nr_shown == 60) {
669 if (time_before(jiffies, resume)) {
670 nr_unshown++;
671 return;
672 }
673 if (nr_unshown) {
674 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
675 nr_unshown);
676 nr_unshown = 0;
677 }
678 nr_shown = 0;
679 }
680 if (nr_shown++ == 0)
681 resume = jiffies + 60 * HZ;
682
683 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
684 index = linear_page_index(vma, addr);
685
686 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
687 current->comm,
688 (long long)pte_val(pte), (long long)pmd_val(*pmd));
689 if (page)
690 dump_page(page, "bad pte");
691 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
692 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
693
694
695
696 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
697 vma->vm_file,
698 vma->vm_ops ? vma->vm_ops->fault : NULL,
699 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
700 mapping ? mapping->a_ops->readpage : NULL);
701 dump_stack();
702 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
703}
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747#ifdef __HAVE_ARCH_PTE_SPECIAL
748# define HAVE_PTE_SPECIAL 1
749#else
750# define HAVE_PTE_SPECIAL 0
751#endif
752struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
753 pte_t pte)
754{
755 unsigned long pfn = pte_pfn(pte);
756
757 if (HAVE_PTE_SPECIAL) {
758 if (likely(!pte_special(pte)))
759 goto check_pfn;
760 if (vma->vm_ops && vma->vm_ops->find_special_page)
761 return vma->vm_ops->find_special_page(vma, addr);
762 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
763 return NULL;
764 if (!is_zero_pfn(pfn))
765 print_bad_pte(vma, addr, pte, NULL);
766 return NULL;
767 }
768
769
770
771 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
772 if (vma->vm_flags & VM_MIXEDMAP) {
773 if (!pfn_valid(pfn))
774 return NULL;
775 goto out;
776 } else {
777 unsigned long off;
778 off = (addr - vma->vm_start) >> PAGE_SHIFT;
779 if (pfn == vma->vm_pgoff + off)
780 return NULL;
781 if (!is_cow_mapping(vma->vm_flags))
782 return NULL;
783 }
784 }
785
786 if (is_zero_pfn(pfn))
787 return NULL;
788check_pfn:
789 if (unlikely(pfn > highest_memmap_pfn)) {
790 print_bad_pte(vma, addr, pte, NULL);
791 return NULL;
792 }
793
794
795
796
797
798out:
799 return pfn_to_page(pfn);
800}
801
802#ifdef CONFIG_TRANSPARENT_HUGEPAGE
803struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
804 pmd_t pmd)
805{
806 unsigned long pfn = pmd_pfn(pmd);
807
808
809
810
811
812
813 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
814 if (vma->vm_flags & VM_MIXEDMAP) {
815 if (!pfn_valid(pfn))
816 return NULL;
817 goto out;
818 } else {
819 unsigned long off;
820 off = (addr - vma->vm_start) >> PAGE_SHIFT;
821 if (pfn == vma->vm_pgoff + off)
822 return NULL;
823 if (!is_cow_mapping(vma->vm_flags))
824 return NULL;
825 }
826 }
827
828 if (is_zero_pfn(pfn))
829 return NULL;
830 if (unlikely(pfn > highest_memmap_pfn))
831 return NULL;
832
833
834
835
836
837out:
838 return pfn_to_page(pfn);
839}
840#endif
841
842
843
844
845
846
847
848static inline unsigned long
849copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
850 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
851 unsigned long addr, int *rss)
852{
853 unsigned long vm_flags = vma->vm_flags;
854 pte_t pte = *src_pte;
855 struct page *page;
856
857
858 if (unlikely(!pte_present(pte))) {
859 swp_entry_t entry = pte_to_swp_entry(pte);
860
861 if (likely(!non_swap_entry(entry))) {
862 if (swap_duplicate(entry) < 0)
863 return entry.val;
864
865
866 if (unlikely(list_empty(&dst_mm->mmlist))) {
867 spin_lock(&mmlist_lock);
868 if (list_empty(&dst_mm->mmlist))
869 list_add(&dst_mm->mmlist,
870 &src_mm->mmlist);
871 spin_unlock(&mmlist_lock);
872 }
873 rss[MM_SWAPENTS]++;
874 } else if (is_migration_entry(entry)) {
875 page = migration_entry_to_page(entry);
876
877 rss[mm_counter(page)]++;
878
879 if (is_write_migration_entry(entry) &&
880 is_cow_mapping(vm_flags)) {
881
882
883
884
885 make_migration_entry_read(&entry);
886 pte = swp_entry_to_pte(entry);
887 if (pte_swp_soft_dirty(*src_pte))
888 pte = pte_swp_mksoft_dirty(pte);
889 set_pte_at(src_mm, addr, src_pte, pte);
890 }
891 }
892 goto out_set_pte;
893 }
894
895
896
897
898
899 if (is_cow_mapping(vm_flags)) {
900 ptep_set_wrprotect(src_mm, addr, src_pte);
901 pte = pte_wrprotect(pte);
902 }
903
904
905
906
907
908 if (vm_flags & VM_SHARED)
909 pte = pte_mkclean(pte);
910 pte = pte_mkold(pte);
911
912 page = vm_normal_page(vma, addr, pte);
913 if (page) {
914 get_page(page);
915 page_dup_rmap(page, false);
916 rss[mm_counter(page)]++;
917 }
918
919out_set_pte:
920 set_pte_at(dst_mm, addr, dst_pte, pte);
921 return 0;
922}
923
924static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
925 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
926 unsigned long addr, unsigned long end)
927{
928 pte_t *orig_src_pte, *orig_dst_pte;
929 pte_t *src_pte, *dst_pte;
930 spinlock_t *src_ptl, *dst_ptl;
931 int progress = 0;
932 int rss[NR_MM_COUNTERS];
933 swp_entry_t entry = (swp_entry_t){0};
934
935again:
936 init_rss_vec(rss);
937
938 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
939 if (!dst_pte)
940 return -ENOMEM;
941 src_pte = pte_offset_map(src_pmd, addr);
942 src_ptl = pte_lockptr(src_mm, src_pmd);
943 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
944 orig_src_pte = src_pte;
945 orig_dst_pte = dst_pte;
946 arch_enter_lazy_mmu_mode();
947
948 do {
949
950
951
952
953 if (progress >= 32) {
954 progress = 0;
955 if (need_resched() ||
956 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
957 break;
958 }
959 if (pte_none(*src_pte)) {
960 progress++;
961 continue;
962 }
963 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
964 vma, addr, rss);
965 if (entry.val)
966 break;
967 progress += 8;
968 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
969
970 arch_leave_lazy_mmu_mode();
971 spin_unlock(src_ptl);
972 pte_unmap(orig_src_pte);
973 add_mm_rss_vec(dst_mm, rss);
974 pte_unmap_unlock(orig_dst_pte, dst_ptl);
975 cond_resched();
976
977 if (entry.val) {
978 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
979 return -ENOMEM;
980 progress = 0;
981 }
982 if (addr != end)
983 goto again;
984 return 0;
985}
986
987static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
988 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
989 unsigned long addr, unsigned long end)
990{
991 pmd_t *src_pmd, *dst_pmd;
992 unsigned long next;
993
994 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
995 if (!dst_pmd)
996 return -ENOMEM;
997 src_pmd = pmd_offset(src_pud, addr);
998 do {
999 next = pmd_addr_end(addr, end);
1000 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
1001 int err;
1002 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1003 err = copy_huge_pmd(dst_mm, src_mm,
1004 dst_pmd, src_pmd, addr, vma);
1005 if (err == -ENOMEM)
1006 return -ENOMEM;
1007 if (!err)
1008 continue;
1009
1010 }
1011 if (pmd_none_or_clear_bad(src_pmd))
1012 continue;
1013 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1014 vma, addr, next))
1015 return -ENOMEM;
1016 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1017 return 0;
1018}
1019
1020static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1021 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1022 unsigned long addr, unsigned long end)
1023{
1024 pud_t *src_pud, *dst_pud;
1025 unsigned long next;
1026
1027 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1028 if (!dst_pud)
1029 return -ENOMEM;
1030 src_pud = pud_offset(src_pgd, addr);
1031 do {
1032 next = pud_addr_end(addr, end);
1033 if (pud_none_or_clear_bad(src_pud))
1034 continue;
1035 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1036 vma, addr, next))
1037 return -ENOMEM;
1038 } while (dst_pud++, src_pud++, addr = next, addr != end);
1039 return 0;
1040}
1041
1042int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1043 struct vm_area_struct *vma)
1044{
1045 pgd_t *src_pgd, *dst_pgd;
1046 unsigned long next;
1047 unsigned long addr = vma->vm_start;
1048 unsigned long end = vma->vm_end;
1049 unsigned long mmun_start;
1050 unsigned long mmun_end;
1051 bool is_cow;
1052 int ret;
1053
1054
1055
1056
1057
1058
1059
1060 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1061 !vma->anon_vma)
1062 return 0;
1063
1064 if (is_vm_hugetlb_page(vma))
1065 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1066
1067 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1068
1069
1070
1071
1072 ret = track_pfn_copy(vma);
1073 if (ret)
1074 return ret;
1075 }
1076
1077
1078
1079
1080
1081
1082
1083 is_cow = is_cow_mapping(vma->vm_flags);
1084 mmun_start = addr;
1085 mmun_end = end;
1086 if (is_cow)
1087 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1088 mmun_end);
1089
1090 ret = 0;
1091 dst_pgd = pgd_offset(dst_mm, addr);
1092 src_pgd = pgd_offset(src_mm, addr);
1093 do {
1094 next = pgd_addr_end(addr, end);
1095 if (pgd_none_or_clear_bad(src_pgd))
1096 continue;
1097 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1098 vma, addr, next))) {
1099 ret = -ENOMEM;
1100 break;
1101 }
1102 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1103
1104 if (is_cow)
1105 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1106 return ret;
1107}
1108
1109static unsigned long zap_pte_range(struct mmu_gather *tlb,
1110 struct vm_area_struct *vma, pmd_t *pmd,
1111 unsigned long addr, unsigned long end,
1112 struct zap_details *details)
1113{
1114 struct mm_struct *mm = tlb->mm;
1115 int force_flush = 0;
1116 int rss[NR_MM_COUNTERS];
1117 spinlock_t *ptl;
1118 pte_t *start_pte;
1119 pte_t *pte;
1120 swp_entry_t entry;
1121 struct page *pending_page = NULL;
1122
1123again:
1124 init_rss_vec(rss);
1125 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1126 pte = start_pte;
1127 arch_enter_lazy_mmu_mode();
1128 do {
1129 pte_t ptent = *pte;
1130 if (pte_none(ptent)) {
1131 continue;
1132 }
1133
1134 if (pte_present(ptent)) {
1135 struct page *page;
1136
1137 page = vm_normal_page(vma, addr, ptent);
1138 if (unlikely(details) && page) {
1139
1140
1141
1142
1143
1144 if (details->check_mapping &&
1145 details->check_mapping != page_rmapping(page))
1146 continue;
1147 }
1148 ptent = ptep_get_and_clear_full(mm, addr, pte,
1149 tlb->fullmm);
1150 tlb_remove_tlb_entry(tlb, pte, addr);
1151 if (unlikely(!page))
1152 continue;
1153
1154 if (!PageAnon(page)) {
1155 if (pte_dirty(ptent)) {
1156
1157
1158
1159
1160 if (unlikely(details && details->ignore_dirty))
1161 continue;
1162 force_flush = 1;
1163 set_page_dirty(page);
1164 }
1165 if (pte_young(ptent) &&
1166 likely(!(vma->vm_flags & VM_SEQ_READ)))
1167 mark_page_accessed(page);
1168 }
1169 rss[mm_counter(page)]--;
1170 page_remove_rmap(page, false);
1171 if (unlikely(page_mapcount(page) < 0))
1172 print_bad_pte(vma, addr, ptent, page);
1173 if (unlikely(__tlb_remove_page(tlb, page))) {
1174 force_flush = 1;
1175 pending_page = page;
1176 addr += PAGE_SIZE;
1177 break;
1178 }
1179 continue;
1180 }
1181
1182 if (unlikely(details && !details->check_swap_entries))
1183 continue;
1184
1185 entry = pte_to_swp_entry(ptent);
1186 if (!non_swap_entry(entry))
1187 rss[MM_SWAPENTS]--;
1188 else if (is_migration_entry(entry)) {
1189 struct page *page;
1190
1191 page = migration_entry_to_page(entry);
1192 rss[mm_counter(page)]--;
1193 }
1194 if (unlikely(!free_swap_and_cache(entry)))
1195 print_bad_pte(vma, addr, ptent, NULL);
1196 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1197 } while (pte++, addr += PAGE_SIZE, addr != end);
1198
1199 add_mm_rss_vec(mm, rss);
1200 arch_leave_lazy_mmu_mode();
1201
1202
1203 if (force_flush)
1204 tlb_flush_mmu_tlbonly(tlb);
1205 pte_unmap_unlock(start_pte, ptl);
1206
1207
1208
1209
1210
1211
1212
1213 if (force_flush) {
1214 force_flush = 0;
1215 tlb_flush_mmu_free(tlb);
1216 if (pending_page) {
1217
1218 __tlb_remove_pte_page(tlb, pending_page);
1219 pending_page = NULL;
1220 }
1221 if (addr != end)
1222 goto again;
1223 }
1224
1225 return addr;
1226}
1227
1228static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1229 struct vm_area_struct *vma, pud_t *pud,
1230 unsigned long addr, unsigned long end,
1231 struct zap_details *details)
1232{
1233 pmd_t *pmd;
1234 unsigned long next;
1235
1236 pmd = pmd_offset(pud, addr);
1237 do {
1238 next = pmd_addr_end(addr, end);
1239 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1240 if (next - addr != HPAGE_PMD_SIZE) {
1241 VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
1242 !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1243 split_huge_pmd(vma, pmd, addr);
1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1245 goto next;
1246
1247 }
1248
1249
1250
1251
1252
1253
1254
1255 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1256 goto next;
1257 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1258next:
1259 cond_resched();
1260 } while (pmd++, addr = next, addr != end);
1261
1262 return addr;
1263}
1264
1265static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1266 struct vm_area_struct *vma, pgd_t *pgd,
1267 unsigned long addr, unsigned long end,
1268 struct zap_details *details)
1269{
1270 pud_t *pud;
1271 unsigned long next;
1272
1273 pud = pud_offset(pgd, addr);
1274 do {
1275 next = pud_addr_end(addr, end);
1276 if (pud_none_or_clear_bad(pud))
1277 continue;
1278 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1279 } while (pud++, addr = next, addr != end);
1280
1281 return addr;
1282}
1283
1284void unmap_page_range(struct mmu_gather *tlb,
1285 struct vm_area_struct *vma,
1286 unsigned long addr, unsigned long end,
1287 struct zap_details *details)
1288{
1289 pgd_t *pgd;
1290 unsigned long next;
1291
1292 BUG_ON(addr >= end);
1293 tlb_start_vma(tlb, vma);
1294 pgd = pgd_offset(vma->vm_mm, addr);
1295 do {
1296 next = pgd_addr_end(addr, end);
1297 if (pgd_none_or_clear_bad(pgd))
1298 continue;
1299 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1300 } while (pgd++, addr = next, addr != end);
1301 tlb_end_vma(tlb, vma);
1302}
1303
1304
1305static void unmap_single_vma(struct mmu_gather *tlb,
1306 struct vm_area_struct *vma, unsigned long start_addr,
1307 unsigned long end_addr,
1308 struct zap_details *details)
1309{
1310 unsigned long start = max(vma->vm_start, start_addr);
1311 unsigned long end;
1312
1313 if (start >= vma->vm_end)
1314 return;
1315 end = min(vma->vm_end, end_addr);
1316 if (end <= vma->vm_start)
1317 return;
1318
1319 if (vma->vm_file)
1320 uprobe_munmap(vma, start, end);
1321
1322 if (unlikely(vma->vm_flags & VM_PFNMAP))
1323 untrack_pfn(vma, 0, 0);
1324
1325 if (start != end) {
1326 if (unlikely(is_vm_hugetlb_page(vma))) {
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338 if (vma->vm_file) {
1339 i_mmap_lock_write(vma->vm_file->f_mapping);
1340 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1341 i_mmap_unlock_write(vma->vm_file->f_mapping);
1342 }
1343 } else
1344 unmap_page_range(tlb, vma, start, end, details);
1345 }
1346}
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366void unmap_vmas(struct mmu_gather *tlb,
1367 struct vm_area_struct *vma, unsigned long start_addr,
1368 unsigned long end_addr)
1369{
1370 struct mm_struct *mm = vma->vm_mm;
1371
1372 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1373 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1374 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1375 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1376}
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1388 unsigned long size, struct zap_details *details)
1389{
1390 struct mm_struct *mm = vma->vm_mm;
1391 struct mmu_gather tlb;
1392 unsigned long end = start + size;
1393
1394 lru_add_drain();
1395 tlb_gather_mmu(&tlb, mm, start, end);
1396 update_hiwater_rss(mm);
1397 mmu_notifier_invalidate_range_start(mm, start, end);
1398 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1399 unmap_single_vma(&tlb, vma, start, end, details);
1400 mmu_notifier_invalidate_range_end(mm, start, end);
1401 tlb_finish_mmu(&tlb, start, end);
1402}
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1414 unsigned long size, struct zap_details *details)
1415{
1416 struct mm_struct *mm = vma->vm_mm;
1417 struct mmu_gather tlb;
1418 unsigned long end = address + size;
1419
1420 lru_add_drain();
1421 tlb_gather_mmu(&tlb, mm, address, end);
1422 update_hiwater_rss(mm);
1423 mmu_notifier_invalidate_range_start(mm, address, end);
1424 unmap_single_vma(&tlb, vma, address, end, details);
1425 mmu_notifier_invalidate_range_end(mm, address, end);
1426 tlb_finish_mmu(&tlb, address, end);
1427}
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1442 unsigned long size)
1443{
1444 if (address < vma->vm_start || address + size > vma->vm_end ||
1445 !(vma->vm_flags & VM_PFNMAP))
1446 return -1;
1447 zap_page_range_single(vma, address, size, NULL);
1448 return 0;
1449}
1450EXPORT_SYMBOL_GPL(zap_vma_ptes);
1451
1452pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1453 spinlock_t **ptl)
1454{
1455 pgd_t * pgd = pgd_offset(mm, addr);
1456 pud_t * pud = pud_alloc(mm, pgd, addr);
1457 if (pud) {
1458 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1459 if (pmd) {
1460 VM_BUG_ON(pmd_trans_huge(*pmd));
1461 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1462 }
1463 }
1464 return NULL;
1465}
1466
1467
1468
1469
1470
1471
1472
1473
1474static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1475 struct page *page, pgprot_t prot)
1476{
1477 struct mm_struct *mm = vma->vm_mm;
1478 int retval;
1479 pte_t *pte;
1480 spinlock_t *ptl;
1481
1482 retval = -EINVAL;
1483 if (PageAnon(page))
1484 goto out;
1485 retval = -ENOMEM;
1486 flush_dcache_page(page);
1487 pte = get_locked_pte(mm, addr, &ptl);
1488 if (!pte)
1489 goto out;
1490 retval = -EBUSY;
1491 if (!pte_none(*pte))
1492 goto out_unlock;
1493
1494
1495 get_page(page);
1496 inc_mm_counter_fast(mm, mm_counter_file(page));
1497 page_add_file_rmap(page, false);
1498 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1499
1500 retval = 0;
1501 pte_unmap_unlock(pte, ptl);
1502 return retval;
1503out_unlock:
1504 pte_unmap_unlock(pte, ptl);
1505out:
1506 return retval;
1507}
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1537 struct page *page)
1538{
1539 if (addr < vma->vm_start || addr >= vma->vm_end)
1540 return -EFAULT;
1541 if (!page_count(page))
1542 return -EINVAL;
1543 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1544 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1545 BUG_ON(vma->vm_flags & VM_PFNMAP);
1546 vma->vm_flags |= VM_MIXEDMAP;
1547 }
1548 return insert_page(vma, addr, page, vma->vm_page_prot);
1549}
1550EXPORT_SYMBOL(vm_insert_page);
1551
1552static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1553 pfn_t pfn, pgprot_t prot)
1554{
1555 struct mm_struct *mm = vma->vm_mm;
1556 int retval;
1557 pte_t *pte, entry;
1558 spinlock_t *ptl;
1559
1560 retval = -ENOMEM;
1561 pte = get_locked_pte(mm, addr, &ptl);
1562 if (!pte)
1563 goto out;
1564 retval = -EBUSY;
1565 if (!pte_none(*pte))
1566 goto out_unlock;
1567
1568
1569 if (pfn_t_devmap(pfn))
1570 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1571 else
1572 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1573 set_pte_at(mm, addr, pte, entry);
1574 update_mmu_cache(vma, addr, pte);
1575
1576 retval = 0;
1577out_unlock:
1578 pte_unmap_unlock(pte, ptl);
1579out:
1580 return retval;
1581}
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1601 unsigned long pfn)
1602{
1603 return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1604}
1605EXPORT_SYMBOL(vm_insert_pfn);
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1623 unsigned long pfn, pgprot_t pgprot)
1624{
1625 int ret;
1626
1627
1628
1629
1630
1631
1632 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1633 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1634 (VM_PFNMAP|VM_MIXEDMAP));
1635 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1636 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1637
1638 if (addr < vma->vm_start || addr >= vma->vm_end)
1639 return -EFAULT;
1640 if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
1641 return -EINVAL;
1642
1643 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
1644
1645 return ret;
1646}
1647EXPORT_SYMBOL(vm_insert_pfn_prot);
1648
1649int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1650 pfn_t pfn)
1651{
1652 pgprot_t pgprot = vma->vm_page_prot;
1653
1654 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1655
1656 if (addr < vma->vm_start || addr >= vma->vm_end)
1657 return -EFAULT;
1658 if (track_pfn_insert(vma, &pgprot, pfn))
1659 return -EINVAL;
1660
1661
1662
1663
1664
1665
1666
1667
1668 if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1669 struct page *page;
1670
1671
1672
1673
1674
1675
1676 page = pfn_to_page(pfn_t_to_pfn(pfn));
1677 return insert_page(vma, addr, page, pgprot);
1678 }
1679 return insert_pfn(vma, addr, pfn, pgprot);
1680}
1681EXPORT_SYMBOL(vm_insert_mixed);
1682
1683
1684
1685
1686
1687
1688static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1689 unsigned long addr, unsigned long end,
1690 unsigned long pfn, pgprot_t prot)
1691{
1692 pte_t *pte;
1693 spinlock_t *ptl;
1694
1695 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1696 if (!pte)
1697 return -ENOMEM;
1698 arch_enter_lazy_mmu_mode();
1699 do {
1700 BUG_ON(!pte_none(*pte));
1701 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1702 pfn++;
1703 } while (pte++, addr += PAGE_SIZE, addr != end);
1704 arch_leave_lazy_mmu_mode();
1705 pte_unmap_unlock(pte - 1, ptl);
1706 return 0;
1707}
1708
1709static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1710 unsigned long addr, unsigned long end,
1711 unsigned long pfn, pgprot_t prot)
1712{
1713 pmd_t *pmd;
1714 unsigned long next;
1715
1716 pfn -= addr >> PAGE_SHIFT;
1717 pmd = pmd_alloc(mm, pud, addr);
1718 if (!pmd)
1719 return -ENOMEM;
1720 VM_BUG_ON(pmd_trans_huge(*pmd));
1721 do {
1722 next = pmd_addr_end(addr, end);
1723 if (remap_pte_range(mm, pmd, addr, next,
1724 pfn + (addr >> PAGE_SHIFT), prot))
1725 return -ENOMEM;
1726 } while (pmd++, addr = next, addr != end);
1727 return 0;
1728}
1729
1730static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1731 unsigned long addr, unsigned long end,
1732 unsigned long pfn, pgprot_t prot)
1733{
1734 pud_t *pud;
1735 unsigned long next;
1736
1737 pfn -= addr >> PAGE_SHIFT;
1738 pud = pud_alloc(mm, pgd, addr);
1739 if (!pud)
1740 return -ENOMEM;
1741 do {
1742 next = pud_addr_end(addr, end);
1743 if (remap_pmd_range(mm, pud, addr, next,
1744 pfn + (addr >> PAGE_SHIFT), prot))
1745 return -ENOMEM;
1746 } while (pud++, addr = next, addr != end);
1747 return 0;
1748}
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1761 unsigned long pfn, unsigned long size, pgprot_t prot)
1762{
1763 pgd_t *pgd;
1764 unsigned long next;
1765 unsigned long end = addr + PAGE_ALIGN(size);
1766 struct mm_struct *mm = vma->vm_mm;
1767 unsigned long remap_pfn = pfn;
1768 int err;
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788 if (is_cow_mapping(vma->vm_flags)) {
1789 if (addr != vma->vm_start || end != vma->vm_end)
1790 return -EINVAL;
1791 vma->vm_pgoff = pfn;
1792 }
1793
1794 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
1795 if (err)
1796 return -EINVAL;
1797
1798 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1799
1800 BUG_ON(addr >= end);
1801 pfn -= addr >> PAGE_SHIFT;
1802 pgd = pgd_offset(mm, addr);
1803 flush_cache_range(vma, addr, end);
1804 do {
1805 next = pgd_addr_end(addr, end);
1806 err = remap_pud_range(mm, pgd, addr, next,
1807 pfn + (addr >> PAGE_SHIFT), prot);
1808 if (err)
1809 break;
1810 } while (pgd++, addr = next, addr != end);
1811
1812 if (err)
1813 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
1814
1815 return err;
1816}
1817EXPORT_SYMBOL(remap_pfn_range);
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1833{
1834 unsigned long vm_len, pfn, pages;
1835
1836
1837 if (start + len < start)
1838 return -EINVAL;
1839
1840
1841
1842
1843
1844 len += start & ~PAGE_MASK;
1845 pfn = start >> PAGE_SHIFT;
1846 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1847 if (pfn + pages < pfn)
1848 return -EINVAL;
1849
1850
1851 if (vma->vm_pgoff > pages)
1852 return -EINVAL;
1853 pfn += vma->vm_pgoff;
1854 pages -= vma->vm_pgoff;
1855
1856
1857 vm_len = vma->vm_end - vma->vm_start;
1858 if (vm_len >> PAGE_SHIFT > pages)
1859 return -EINVAL;
1860
1861
1862 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1863}
1864EXPORT_SYMBOL(vm_iomap_memory);
1865
1866static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1867 unsigned long addr, unsigned long end,
1868 pte_fn_t fn, void *data)
1869{
1870 pte_t *pte;
1871 int err;
1872 pgtable_t token;
1873 spinlock_t *uninitialized_var(ptl);
1874
1875 pte = (mm == &init_mm) ?
1876 pte_alloc_kernel(pmd, addr) :
1877 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1878 if (!pte)
1879 return -ENOMEM;
1880
1881 BUG_ON(pmd_huge(*pmd));
1882
1883 arch_enter_lazy_mmu_mode();
1884
1885 token = pmd_pgtable(*pmd);
1886
1887 do {
1888 err = fn(pte++, token, addr, data);
1889 if (err)
1890 break;
1891 } while (addr += PAGE_SIZE, addr != end);
1892
1893 arch_leave_lazy_mmu_mode();
1894
1895 if (mm != &init_mm)
1896 pte_unmap_unlock(pte-1, ptl);
1897 return err;
1898}
1899
1900static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1901 unsigned long addr, unsigned long end,
1902 pte_fn_t fn, void *data)
1903{
1904 pmd_t *pmd;
1905 unsigned long next;
1906 int err;
1907
1908 BUG_ON(pud_huge(*pud));
1909
1910 pmd = pmd_alloc(mm, pud, addr);
1911 if (!pmd)
1912 return -ENOMEM;
1913 do {
1914 next = pmd_addr_end(addr, end);
1915 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1916 if (err)
1917 break;
1918 } while (pmd++, addr = next, addr != end);
1919 return err;
1920}
1921
1922static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1923 unsigned long addr, unsigned long end,
1924 pte_fn_t fn, void *data)
1925{
1926 pud_t *pud;
1927 unsigned long next;
1928 int err;
1929
1930 pud = pud_alloc(mm, pgd, addr);
1931 if (!pud)
1932 return -ENOMEM;
1933 do {
1934 next = pud_addr_end(addr, end);
1935 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1936 if (err)
1937 break;
1938 } while (pud++, addr = next, addr != end);
1939 return err;
1940}
1941
1942
1943
1944
1945
1946int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1947 unsigned long size, pte_fn_t fn, void *data)
1948{
1949 pgd_t *pgd;
1950 unsigned long next;
1951 unsigned long end = addr + size;
1952 int err;
1953
1954 if (WARN_ON(addr >= end))
1955 return -EINVAL;
1956
1957 pgd = pgd_offset(mm, addr);
1958 do {
1959 next = pgd_addr_end(addr, end);
1960 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1961 if (err)
1962 break;
1963 } while (pgd++, addr = next, addr != end);
1964
1965 return err;
1966}
1967EXPORT_SYMBOL_GPL(apply_to_page_range);
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1978 pte_t *page_table, pte_t orig_pte)
1979{
1980 int same = 1;
1981#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1982 if (sizeof(pte_t) > sizeof(unsigned long)) {
1983 spinlock_t *ptl = pte_lockptr(mm, pmd);
1984 spin_lock(ptl);
1985 same = pte_same(*page_table, orig_pte);
1986 spin_unlock(ptl);
1987 }
1988#endif
1989 pte_unmap(page_table);
1990 return same;
1991}
1992
1993static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1994{
1995 debug_dma_assert_idle(src);
1996
1997
1998
1999
2000
2001
2002
2003 if (unlikely(!src)) {
2004 void *kaddr = kmap_atomic(dst);
2005 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2006
2007
2008
2009
2010
2011
2012
2013 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2014 clear_page(kaddr);
2015 kunmap_atomic(kaddr);
2016 flush_dcache_page(dst);
2017 } else
2018 copy_user_highpage(dst, src, va, vma);
2019}
2020
2021static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2022{
2023 struct file *vm_file = vma->vm_file;
2024
2025 if (vm_file)
2026 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2027
2028
2029
2030
2031
2032 return GFP_KERNEL;
2033}
2034
2035
2036
2037
2038
2039
2040
2041static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2042 unsigned long address)
2043{
2044 struct vm_fault vmf;
2045 int ret;
2046
2047 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2048 vmf.pgoff = page->index;
2049 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2050 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2051 vmf.page = page;
2052 vmf.cow_page = NULL;
2053
2054 ret = vma->vm_ops->page_mkwrite(vma, &vmf);
2055 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2056 return ret;
2057 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2058 lock_page(page);
2059 if (!page->mapping) {
2060 unlock_page(page);
2061 return 0;
2062 }
2063 ret |= VM_FAULT_LOCKED;
2064 } else
2065 VM_BUG_ON_PAGE(!PageLocked(page), page);
2066 return ret;
2067}
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2078 struct page *page, int page_mkwrite, int dirty_shared)
2079 __releases(fe->ptl)
2080{
2081 struct vm_area_struct *vma = fe->vma;
2082 pte_t entry;
2083
2084
2085
2086
2087
2088 if (page)
2089 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2090
2091 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2092 entry = pte_mkyoung(orig_pte);
2093 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2094 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
2095 update_mmu_cache(vma, fe->address, fe->pte);
2096 pte_unmap_unlock(fe->pte, fe->ptl);
2097
2098 if (dirty_shared) {
2099 struct address_space *mapping;
2100 int dirtied;
2101
2102 if (!page_mkwrite)
2103 lock_page(page);
2104
2105 dirtied = set_page_dirty(page);
2106 VM_BUG_ON_PAGE(PageAnon(page), page);
2107 mapping = page->mapping;
2108 unlock_page(page);
2109 put_page(page);
2110
2111 if ((dirtied || page_mkwrite) && mapping) {
2112
2113
2114
2115
2116 balance_dirty_pages_ratelimited(mapping);
2117 }
2118
2119 if (!page_mkwrite)
2120 file_update_time(vma->vm_file);
2121 }
2122
2123 return VM_FAULT_WRITE;
2124}
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2143 struct page *old_page)
2144{
2145 struct vm_area_struct *vma = fe->vma;
2146 struct mm_struct *mm = vma->vm_mm;
2147 struct page *new_page = NULL;
2148 pte_t entry;
2149 int page_copied = 0;
2150 const unsigned long mmun_start = fe->address & PAGE_MASK;
2151 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2152 struct mem_cgroup *memcg;
2153
2154 if (unlikely(anon_vma_prepare(vma)))
2155 goto oom;
2156
2157 if (is_zero_pfn(pte_pfn(orig_pte))) {
2158 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2159 if (!new_page)
2160 goto oom;
2161 } else {
2162 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2163 fe->address);
2164 if (!new_page)
2165 goto oom;
2166 cow_user_page(new_page, old_page, fe->address, vma);
2167 }
2168
2169 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2170 goto oom_free_new;
2171
2172 __SetPageUptodate(new_page);
2173
2174 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2175
2176
2177
2178
2179 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
2180 if (likely(pte_same(*fe->pte, orig_pte))) {
2181 if (old_page) {
2182 if (!PageAnon(old_page)) {
2183 dec_mm_counter_fast(mm,
2184 mm_counter_file(old_page));
2185 inc_mm_counter_fast(mm, MM_ANONPAGES);
2186 }
2187 } else {
2188 inc_mm_counter_fast(mm, MM_ANONPAGES);
2189 }
2190 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2191 entry = mk_pte(new_page, vma->vm_page_prot);
2192 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2193
2194
2195
2196
2197
2198
2199 ptep_clear_flush_notify(vma, fe->address, fe->pte);
2200 page_add_new_anon_rmap(new_page, vma, fe->address, false);
2201 mem_cgroup_commit_charge(new_page, memcg, false, false);
2202 lru_cache_add_active_or_unevictable(new_page, vma);
2203
2204
2205
2206
2207
2208 set_pte_at_notify(mm, fe->address, fe->pte, entry);
2209 update_mmu_cache(vma, fe->address, fe->pte);
2210 if (old_page) {
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233 page_remove_rmap(old_page, false);
2234 }
2235
2236
2237 new_page = old_page;
2238 page_copied = 1;
2239 } else {
2240 mem_cgroup_cancel_charge(new_page, memcg, false);
2241 }
2242
2243 if (new_page)
2244 put_page(new_page);
2245
2246 pte_unmap_unlock(fe->pte, fe->ptl);
2247 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2248 if (old_page) {
2249
2250
2251
2252
2253 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2254 lock_page(old_page);
2255 if (PageMlocked(old_page))
2256 munlock_vma_page(old_page);
2257 unlock_page(old_page);
2258 }
2259 put_page(old_page);
2260 }
2261 return page_copied ? VM_FAULT_WRITE : 0;
2262oom_free_new:
2263 put_page(new_page);
2264oom:
2265 if (old_page)
2266 put_page(old_page);
2267 return VM_FAULT_OOM;
2268}
2269
2270
2271
2272
2273
2274static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
2275{
2276 struct vm_area_struct *vma = fe->vma;
2277
2278 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2279 struct vm_fault vmf = {
2280 .page = NULL,
2281 .pgoff = linear_page_index(vma, fe->address),
2282 .virtual_address =
2283 (void __user *)(fe->address & PAGE_MASK),
2284 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2285 };
2286 int ret;
2287
2288 pte_unmap_unlock(fe->pte, fe->ptl);
2289 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2290 if (ret & VM_FAULT_ERROR)
2291 return ret;
2292 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2293 &fe->ptl);
2294
2295
2296
2297
2298 if (!pte_same(*fe->pte, orig_pte)) {
2299 pte_unmap_unlock(fe->pte, fe->ptl);
2300 return 0;
2301 }
2302 }
2303 return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
2304}
2305
2306static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2307 struct page *old_page)
2308 __releases(fe->ptl)
2309{
2310 struct vm_area_struct *vma = fe->vma;
2311 int page_mkwrite = 0;
2312
2313 get_page(old_page);
2314
2315 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2316 int tmp;
2317
2318 pte_unmap_unlock(fe->pte, fe->ptl);
2319 tmp = do_page_mkwrite(vma, old_page, fe->address);
2320 if (unlikely(!tmp || (tmp &
2321 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2322 put_page(old_page);
2323 return tmp;
2324 }
2325
2326
2327
2328
2329
2330
2331 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2332 &fe->ptl);
2333 if (!pte_same(*fe->pte, orig_pte)) {
2334 unlock_page(old_page);
2335 pte_unmap_unlock(fe->pte, fe->ptl);
2336 put_page(old_page);
2337 return 0;
2338 }
2339 page_mkwrite = 1;
2340 }
2341
2342 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
2343}
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2364 __releases(fe->ptl)
2365{
2366 struct vm_area_struct *vma = fe->vma;
2367 struct page *old_page;
2368
2369 old_page = vm_normal_page(vma, fe->address, orig_pte);
2370 if (!old_page) {
2371
2372
2373
2374
2375
2376
2377
2378 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2379 (VM_WRITE|VM_SHARED))
2380 return wp_pfn_shared(fe, orig_pte);
2381
2382 pte_unmap_unlock(fe->pte, fe->ptl);
2383 return wp_page_copy(fe, orig_pte, old_page);
2384 }
2385
2386
2387
2388
2389
2390 if (PageAnon(old_page) && !PageKsm(old_page)) {
2391 int total_mapcount;
2392 if (!trylock_page(old_page)) {
2393 get_page(old_page);
2394 pte_unmap_unlock(fe->pte, fe->ptl);
2395 lock_page(old_page);
2396 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2397 fe->address, &fe->ptl);
2398 if (!pte_same(*fe->pte, orig_pte)) {
2399 unlock_page(old_page);
2400 pte_unmap_unlock(fe->pte, fe->ptl);
2401 put_page(old_page);
2402 return 0;
2403 }
2404 put_page(old_page);
2405 }
2406 if (reuse_swap_page(old_page, &total_mapcount)) {
2407 if (total_mapcount == 1) {
2408
2409
2410
2411
2412
2413
2414
2415 page_move_anon_rmap(old_page, vma);
2416 }
2417 unlock_page(old_page);
2418 return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
2419 }
2420 unlock_page(old_page);
2421 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2422 (VM_WRITE|VM_SHARED))) {
2423 return wp_page_shared(fe, orig_pte, old_page);
2424 }
2425
2426
2427
2428
2429 get_page(old_page);
2430
2431 pte_unmap_unlock(fe->pte, fe->ptl);
2432 return wp_page_copy(fe, orig_pte, old_page);
2433}
2434
2435static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2436 unsigned long start_addr, unsigned long end_addr,
2437 struct zap_details *details)
2438{
2439 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2440}
2441
2442static inline void unmap_mapping_range_tree(struct rb_root *root,
2443 struct zap_details *details)
2444{
2445 struct vm_area_struct *vma;
2446 pgoff_t vba, vea, zba, zea;
2447
2448 vma_interval_tree_foreach(vma, root,
2449 details->first_index, details->last_index) {
2450
2451 vba = vma->vm_pgoff;
2452 vea = vba + vma_pages(vma) - 1;
2453 zba = details->first_index;
2454 if (zba < vba)
2455 zba = vba;
2456 zea = details->last_index;
2457 if (zea > vea)
2458 zea = vea;
2459
2460 unmap_mapping_range_vma(vma,
2461 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2462 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2463 details);
2464 }
2465}
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484void unmap_mapping_range(struct address_space *mapping,
2485 loff_t const holebegin, loff_t const holelen, int even_cows)
2486{
2487 struct zap_details details = { };
2488 pgoff_t hba = holebegin >> PAGE_SHIFT;
2489 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2490
2491
2492 if (sizeof(holelen) > sizeof(hlen)) {
2493 long long holeend =
2494 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2495 if (holeend & ~(long long)ULONG_MAX)
2496 hlen = ULONG_MAX - hba + 1;
2497 }
2498
2499 details.check_mapping = even_cows? NULL: mapping;
2500 details.first_index = hba;
2501 details.last_index = hba + hlen - 1;
2502 if (details.last_index < details.first_index)
2503 details.last_index = ULONG_MAX;
2504
2505 i_mmap_lock_write(mapping);
2506 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2507 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2508 i_mmap_unlock_write(mapping);
2509}
2510EXPORT_SYMBOL(unmap_mapping_range);
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2521{
2522 struct vm_area_struct *vma = fe->vma;
2523 struct page *page, *swapcache;
2524 struct mem_cgroup *memcg;
2525 swp_entry_t entry;
2526 pte_t pte;
2527 int locked;
2528 int exclusive = 0;
2529 int ret = 0;
2530
2531 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
2532 goto out;
2533
2534 entry = pte_to_swp_entry(orig_pte);
2535 if (unlikely(non_swap_entry(entry))) {
2536 if (is_migration_entry(entry)) {
2537 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
2538 } else if (is_hwpoison_entry(entry)) {
2539 ret = VM_FAULT_HWPOISON;
2540 } else {
2541 print_bad_pte(vma, fe->address, orig_pte, NULL);
2542 ret = VM_FAULT_SIGBUS;
2543 }
2544 goto out;
2545 }
2546 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2547 page = lookup_swap_cache(entry);
2548 if (!page) {
2549 page = swapin_readahead(entry,
2550 GFP_HIGHUSER_MOVABLE, vma, fe->address);
2551 if (!page) {
2552
2553
2554
2555
2556 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2557 fe->address, &fe->ptl);
2558 if (likely(pte_same(*fe->pte, orig_pte)))
2559 ret = VM_FAULT_OOM;
2560 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2561 goto unlock;
2562 }
2563
2564
2565 ret = VM_FAULT_MAJOR;
2566 count_vm_event(PGMAJFAULT);
2567 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
2568 } else if (PageHWPoison(page)) {
2569
2570
2571
2572
2573 ret = VM_FAULT_HWPOISON;
2574 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2575 swapcache = page;
2576 goto out_release;
2577 }
2578
2579 swapcache = page;
2580 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
2581
2582 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2583 if (!locked) {
2584 ret |= VM_FAULT_RETRY;
2585 goto out_release;
2586 }
2587
2588
2589
2590
2591
2592
2593
2594 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2595 goto out_page;
2596
2597 page = ksm_might_need_to_copy(page, vma, fe->address);
2598 if (unlikely(!page)) {
2599 ret = VM_FAULT_OOM;
2600 page = swapcache;
2601 goto out_page;
2602 }
2603
2604 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
2605 &memcg, false)) {
2606 ret = VM_FAULT_OOM;
2607 goto out_page;
2608 }
2609
2610
2611
2612
2613 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2614 &fe->ptl);
2615 if (unlikely(!pte_same(*fe->pte, orig_pte)))
2616 goto out_nomap;
2617
2618 if (unlikely(!PageUptodate(page))) {
2619 ret = VM_FAULT_SIGBUS;
2620 goto out_nomap;
2621 }
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2634 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2635 pte = mk_pte(page, vma->vm_page_prot);
2636 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2637 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2638 fe->flags &= ~FAULT_FLAG_WRITE;
2639 ret |= VM_FAULT_WRITE;
2640 exclusive = RMAP_EXCLUSIVE;
2641 }
2642 flush_icache_page(vma, page);
2643 if (pte_swp_soft_dirty(orig_pte))
2644 pte = pte_mksoft_dirty(pte);
2645 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
2646 if (page == swapcache) {
2647 do_page_add_anon_rmap(page, vma, fe->address, exclusive);
2648 mem_cgroup_commit_charge(page, memcg, true, false);
2649 activate_page(page);
2650 } else {
2651 page_add_new_anon_rmap(page, vma, fe->address, false);
2652 mem_cgroup_commit_charge(page, memcg, false, false);
2653 lru_cache_add_active_or_unevictable(page, vma);
2654 }
2655
2656 swap_free(entry);
2657 if (mem_cgroup_swap_full(page) ||
2658 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2659 try_to_free_swap(page);
2660 unlock_page(page);
2661 if (page != swapcache) {
2662
2663
2664
2665
2666
2667
2668
2669
2670 unlock_page(swapcache);
2671 put_page(swapcache);
2672 }
2673
2674 if (fe->flags & FAULT_FLAG_WRITE) {
2675 ret |= do_wp_page(fe, pte);
2676 if (ret & VM_FAULT_ERROR)
2677 ret &= VM_FAULT_ERROR;
2678 goto out;
2679 }
2680
2681
2682 update_mmu_cache(vma, fe->address, fe->pte);
2683unlock:
2684 pte_unmap_unlock(fe->pte, fe->ptl);
2685out:
2686 return ret;
2687out_nomap:
2688 mem_cgroup_cancel_charge(page, memcg, false);
2689 pte_unmap_unlock(fe->pte, fe->ptl);
2690out_page:
2691 unlock_page(page);
2692out_release:
2693 put_page(page);
2694 if (page != swapcache) {
2695 unlock_page(swapcache);
2696 put_page(swapcache);
2697 }
2698 return ret;
2699}
2700
2701/*
2702 * This is like a special single-page "expand_{down|up}wards()",
2703 * except we must first make sure that 'address{-|+}PAGE_SIZE'
2704 * doesn't hit another vma.
2705 */
2706static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2707{
2708 address &= PAGE_MASK;
2709 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2710 struct vm_area_struct *prev = vma->vm_prev;
2711
2712 /*
2713 * Is there a mapping abutting this one below?
2714 *
2715 * That's only ok if it's the same stack mapping
2716 * that has gotten split..
2717 */
2718 if (prev && prev->vm_end == address)
2719 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2720
2721 return expand_downwards(vma, address - PAGE_SIZE);
2722 }
2723 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2724 struct vm_area_struct *next = vma->vm_next;
2725
2726 /* Same check as above, but for an upward-growing (VM_GROWSUP) stack */
2727 if (next && next->vm_start == address + PAGE_SIZE)
2728 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2729
2730 return expand_upwards(vma, address + PAGE_SIZE);
2731 }
2732 return 0;
2733}
2734
2735/*
2736 * We enter with non-exclusive mmap_sem (to exclude vma changes, but
2737 * allow concurrent faults).  We return with mmap_sem still held, and
2738 * with the pte (if one was mapped here) unmapped and unlocked again.
2739 */
2740static int do_anonymous_page(struct fault_env *fe)
2741{
2742 struct vm_area_struct *vma = fe->vma;
2743 struct mem_cgroup *memcg;
2744 struct page *page;
2745 pte_t entry;
2746
2747 /* File mapping without ->vm_ops ? */
2748 if (vma->vm_flags & VM_SHARED)
2749 return VM_FAULT_SIGBUS;
2750
2751 /* Check if we need to add a guard page to the stack */
2752 if (check_stack_guard_page(vma, fe->address) < 0)
2753 return VM_FAULT_SIGSEGV;
2754
2755 /*
2756 * Use pte_alloc() instead of pte_alloc_map().  We can't run
2757 * pte_offset_map() on pmds where a huge pmd might be created
2758 * from a different thread.
2759 *
2760 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2761 * parallel threads are excluded by other means.
2762 *
2763 * Here we only have down_read(mmap_sem).
2764 */
2765 if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
2766 return VM_FAULT_OOM;
2767
2768 /* See the comment in pte_alloc_one_map() */
2769 if (unlikely(pmd_trans_unstable(fe->pmd)))
2770 return 0;
2771
2772 /* Use the zero-page for reads */
2773 if (!(fe->flags & FAULT_FLAG_WRITE) &&
2774 !mm_forbids_zeropage(vma->vm_mm)) {
2775 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
2776 vma->vm_page_prot));
2777 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2778 &fe->ptl);
2779 if (!pte_none(*fe->pte))
2780 goto unlock;
2781 /* Deliver the page fault to userland, check inside PT lock */
2782 if (userfaultfd_missing(vma)) {
2783 pte_unmap_unlock(fe->pte, fe->ptl);
2784 return handle_userfault(fe, VM_UFFD_MISSING);
2785 }
2786 goto setpte;
2787 }
2788
2789 /* Allocate our own private page. */
2790 if (unlikely(anon_vma_prepare(vma)))
2791 goto oom;
2792 page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2793 if (!page)
2794 goto oom;
2795
2796 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2797 goto oom_free_page;
2798
2799 /*
2800 * The memory barrier inside __SetPageUptodate makes sure that
2801 * preceding stores to the page contents become visible before
2802 * the set_pte_at() write.
2803 */
2804 __SetPageUptodate(page);
2805
2806 entry = mk_pte(page, vma->vm_page_prot);
2807 if (vma->vm_flags & VM_WRITE)
2808 entry = pte_mkwrite(pte_mkdirty(entry));
2809
2810 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2811 &fe->ptl);
2812 if (!pte_none(*fe->pte))
2813 goto release;
2814
2815 /* Deliver the page fault to userland, check inside PT lock */
2816 if (userfaultfd_missing(vma)) {
2817 pte_unmap_unlock(fe->pte, fe->ptl);
2818 mem_cgroup_cancel_charge(page, memcg, false);
2819 put_page(page);
2820 return handle_userfault(fe, VM_UFFD_MISSING);
2821 }
2822
2823 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2824 page_add_new_anon_rmap(page, vma, fe->address, false);
2825 mem_cgroup_commit_charge(page, memcg, false, false);
2826 lru_cache_add_active_or_unevictable(page, vma);
2827setpte:
2828 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
2829
2830 /* No need to invalidate - it was non-present before */
2831 update_mmu_cache(vma, fe->address, fe->pte);
2832unlock:
2833 pte_unmap_unlock(fe->pte, fe->ptl);
2834 return 0;
2835release:
2836 mem_cgroup_cancel_charge(page, memcg, false);
2837 put_page(page);
2838 goto unlock;
2839oom_free_page:
2840 put_page(page);
2841oom:
2842 return VM_FAULT_OOM;
2843}
2844
2845/*
2846 * The mmap_sem must have been held on entry, and may have been
2847 * released depending on flags and vma->vm_ops->fault() return value.
2848 * See filemap_fault() and __lock_page_or_retry().
2849 */
2850static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
2851 struct page *cow_page, struct page **page, void **entry)
2852{
2853 struct vm_area_struct *vma = fe->vma;
2854 struct vm_fault vmf;
2855 int ret;
2856
2857 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
2858 vmf.pgoff = pgoff;
2859 vmf.flags = fe->flags;
2860 vmf.page = NULL;
2861 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2862 vmf.cow_page = cow_page;
2863
2864 ret = vma->vm_ops->fault(vma, &vmf);
2865 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2866 return ret;
2867 if (ret & VM_FAULT_DAX_LOCKED) {
2868 *entry = vmf.entry;
2869 return ret;
2870 }
2871
2872 if (unlikely(PageHWPoison(vmf.page))) {
2873 if (ret & VM_FAULT_LOCKED)
2874 unlock_page(vmf.page);
2875 put_page(vmf.page);
2876 return VM_FAULT_HWPOISON;
2877 }
2878
2879 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2880 lock_page(vmf.page);
2881 else
2882 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
2883
2884 *page = vmf.page;
2885 return ret;
2886}
2887
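/*
 * Map and lock the pte for fe->address, allocating a page table if the
 * pmd is still empty.  A page table pre-allocated by do_fault_around()
 * (fe->prealloc_pte) is consumed here in preference to a fresh
 * allocation.  Returns VM_FAULT_NOPAGE if a huge pmd appeared under us.
 */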
2888static int pte_alloc_one_map(struct fault_env *fe)
2889{
2890 struct vm_area_struct *vma = fe->vma;
2891
2892 if (!pmd_none(*fe->pmd))
2893 goto map_pte;
2894 if (fe->prealloc_pte) {
2895 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
2896 if (unlikely(!pmd_none(*fe->pmd))) {
2897 spin_unlock(fe->ptl);
2898 goto map_pte;
2899 }
2900
2901 atomic_long_inc(&vma->vm_mm->nr_ptes);
2902 pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
2903 spin_unlock(fe->ptl);
2904 fe->prealloc_pte = 0;
2905 } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
2906 return VM_FAULT_OOM;
2907 }
2908map_pte:
2909 /*
2910 * If a huge pmd materialized under us just retry later.  Use
2911 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
2912 * didn't become pmd_trans_huge under us and then back to pmd_none, as
2913 * a result of MADV_DONTNEED running immediately after a huge pmd fault
2914 * in a different thread of this mm, in turn leading to a misleading
2915 * pmd_trans_huge() retval.  All we have to ensure is that it is a
2916 * regular pmd that we can walk with pte_offset_map() and we can do that
2917 * through an atomic read in C, which is what pmd_trans_unstable()
2918 * provides.
2919 */
2920 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
2921 return VM_FAULT_NOPAGE;
2922
2923 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2924 &fe->ptl);
2925 return 0;
2926}
2927
2928#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
2929
2930#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
2931static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2932 unsigned long haddr)
2933{
2934 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
2935 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
2936 return false;
2937 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
2938 return false;
2939 return true;
2940}
2941
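/*
 * Try to map the faulting file page with a huge pmd.  The page must be
 * a compound page and the vma must cover a suitably aligned
 * HPAGE_PMD_SIZE range (see transhuge_vma_suitable() above); otherwise
 * VM_FAULT_FALLBACK tells the caller to map individual ptes instead.
 */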
2942static int do_set_pmd(struct fault_env *fe, struct page *page)
2943{
2944 struct vm_area_struct *vma = fe->vma;
2945 bool write = fe->flags & FAULT_FLAG_WRITE;
2946 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
2947 pmd_t entry;
2948 int i, ret;
2949
2950 if (!transhuge_vma_suitable(vma, haddr))
2951 return VM_FAULT_FALLBACK;
2952
2953 ret = VM_FAULT_FALLBACK;
2954 page = compound_head(page);
2955
2956 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
2957 if (unlikely(!pmd_none(*fe->pmd)))
2958 goto out;
2959
2960 for (i = 0; i < HPAGE_PMD_NR; i++)
2961 flush_icache_page(vma, page + i);
2962
2963 entry = mk_huge_pmd(page, vma->vm_page_prot);
2964 if (write)
2965 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2966
2967 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
2968 page_add_file_rmap(page, true);
2969
2970 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
2971
2972 update_mmu_cache_pmd(vma, haddr, fe->pmd);
2973
2974 /* fault is handled */
2975 ret = 0;
2976 count_vm_event(THP_FILE_MAPPED);
2977out:
2978 spin_unlock(fe->ptl);
2979 return ret;
2980}
2981#else
2982static int do_set_pmd(struct fault_env *fe, struct page *page)
2983{
2984 BUILD_BUG();
2985 return 0;
2986}
2987#endif
2988
2989/**
2990 * alloc_set_pte - set up a new PTE entry for the given page and add a
2991 * reverse mapping.  If needed, a page table is allocated, or the one
2992 * pre-allocated by the caller is used.
2993 *
2994 * @fe: fault environment
2995 * @memcg: memcg to charge the page to (only for private mappings)
2996 * @page: page to map
2997 *
2998 * The caller must take care of unlocking fe->ptl if fe->pte is non-NULL
2999 * on return.  Target users are the page fault handler itself and
3000 * implementations of vma_ops->map_pages.
3001 */
3002int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3003 struct page *page)
3004{
3005 struct vm_area_struct *vma = fe->vma;
3006 bool write = fe->flags & FAULT_FLAG_WRITE;
3007 pte_t entry;
3008 int ret;
3009
3010 if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
3011 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3012 /* THP on COW? */
3013 VM_BUG_ON_PAGE(memcg, page);
3014
3015 ret = do_set_pmd(fe, page);
3016 if (ret != VM_FAULT_FALLBACK)
3017 return ret;
3018 }
3019
3020 if (!fe->pte) {
3021 ret = pte_alloc_one_map(fe);
3022 if (ret)
3023 return ret;
3024 }
3025
3026 /* Re-check under ptl */
3027 if (unlikely(!pte_none(*fe->pte)))
3028 return VM_FAULT_NOPAGE;
3029
3030 flush_icache_page(vma, page);
3031 entry = mk_pte(page, vma->vm_page_prot);
3032 if (write)
3033 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3034
3035 if (write && !(vma->vm_flags & VM_SHARED)) {
3036 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3037 page_add_new_anon_rmap(page, vma, fe->address, false);
3038 mem_cgroup_commit_charge(page, memcg, false, false);
3039 lru_cache_add_active_or_unevictable(page, vma);
3040 } else {
3041 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3042 page_add_file_rmap(page, false);
3043 }
3044 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
3045
3046 /* no need to invalidate: a not-present page won't be cached */
3047 update_mmu_cache(vma, fe->address, fe->pte);
3048
3049 return 0;
3050}
3051
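/*
 * Size of the area mapped around a read fault by do_fault_around(), in
 * bytes.  Defaults to 64kB and is tunable at runtime through debugfs
 * ("fault_around_bytes") when CONFIG_DEBUG_FS is enabled.
 */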
3052static unsigned long fault_around_bytes __read_mostly =
3053 rounddown_pow_of_two(65536);
3054
3055#ifdef CONFIG_DEBUG_FS
3056static int fault_around_bytes_get(void *data, u64 *val)
3057{
3058 *val = fault_around_bytes;
3059 return 0;
3060}
3061
3062/*
3063 * fault_around_bytes must be rounded down to the nearest page order as
3064 * it's what do_fault_around() expects to see.  Values below PAGE_SIZE
3065 * effectively disable fault-around by clamping it to a single page.
3066 */
3067static int fault_around_bytes_set(void *data, u64 val)
3068{
3069 if (val / PAGE_SIZE > PTRS_PER_PTE)
3070 return -EINVAL;
3071 if (val > PAGE_SIZE)
3072 fault_around_bytes = rounddown_pow_of_two(val);
3073 else
3074 fault_around_bytes = PAGE_SIZE;
3075 return 0;
3076}
3077DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
3078 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3079
3080static int __init fault_around_debugfs(void)
3081{
3082 void *ret;
3083
3084 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
3085 &fault_around_bytes_fops);
3086 if (!ret)
3087 pr_warn("Failed to create fault_around_bytes in debugfs");
3088 return 0;
3089}
3090late_initcall(fault_around_debugfs);
3091#endif
3092
3093/*
3094 * do_fault_around() tries to map a few pages around the fault address.
3095 * The hope is that those pages will be needed soon and this will lower
3096 * the number of faults to handle.
3097 *
3098 * It uses vm_ops->map_pages() to map the pages, which skips any page
3099 * that is not ready to be mapped: not up-to-date, locked, etc.
3100 *
3101 * This function doesn't cross VMA boundaries, in order to call
3102 * map_pages() only once.
3103 *
3104 * fault_around_bytes defines how many bytes we'll try to map; it is
3105 * expected to be a power of two greater than or equal to PAGE_SIZE and
3106 * no larger than PTRS_PER_PTE * PAGE_SIZE (see fault_around_bytes_set()).
3107 *
3108 * The virtual address of the area that we map is naturally aligned to
3109 * fault_around_bytes (and therefore to page order), which makes it
3110 * easier to guarantee that we don't cross page table boundaries.
3111 *
3112 * On exit fe->address is restored and fe->pte is cleared; the return
3113 * value is VM_FAULT_NOPAGE if the faulting pte itself got mapped (or a
3114 * huge pmd appeared), and 0 if the caller still has to handle the fault.
3115 */
3116static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
3117{
3118 unsigned long address = fe->address, nr_pages, mask;
3119 pgoff_t end_pgoff;
3120 int off, ret = 0;
3121
3122 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3123 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3124
3125 fe->address = max(address & mask, fe->vma->vm_start);
3126 off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3127 start_pgoff -= off;
3128
3129 /*
3130 * end_pgoff is either the end of the page table, the end of
3131 * the vma, or start_pgoff + nr_pages - 1, whichever is smallest.
3132 */
3133 end_pgoff = start_pgoff -
3134 ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3135 PTRS_PER_PTE - 1;
3136 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
3137 start_pgoff + nr_pages - 1);
3138
3139 if (pmd_none(*fe->pmd)) {
3140 fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
3141 if (!fe->prealloc_pte)
3142 goto out;
3143 smp_wmb();
3144 }
3145
3146 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
3147
3148 /* preallocated pagetable is unused: free it */
3149 if (fe->prealloc_pte) {
3150 pte_free(fe->vma->vm_mm, fe->prealloc_pte);
3151 fe->prealloc_pte = 0;
3152 }
3153
3154 if (pmd_trans_huge(*fe->pmd)) {
3155 ret = VM_FAULT_NOPAGE;
3156 goto out;
3157 }
3158
3159 /* ->map_pages() hasn't done anything useful. Cold page cache? */
3160 if (!fe->pte)
3161 goto out;
3162
3163
3164 fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3165 if (!pte_none(*fe->pte))
3166 ret = VM_FAULT_NOPAGE;
3167 pte_unmap_unlock(fe->pte, fe->ptl);
3168out:
3169 fe->address = address;
3170 fe->pte = NULL;
3171 return ret;
3172}
3173
3174static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
3175{
3176 struct vm_area_struct *vma = fe->vma;
3177 struct page *fault_page;
3178 int ret = 0;
3179
3180 /*
3181 * Let's call ->map_pages() first and use ->fault() as fallback
3182 * if page by the offset is not ready to be mapped (cold cache or
3183 * something).
3184 */
3185 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3186 ret = do_fault_around(fe, pgoff);
3187 if (ret)
3188 return ret;
3189 }
3190
3191 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3192 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3193 return ret;
3194
3195 ret |= alloc_set_pte(fe, NULL, fault_page);
3196 if (fe->pte)
3197 pte_unmap_unlock(fe->pte, fe->ptl);
3198 unlock_page(fault_page);
3199 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3200 put_page(fault_page);
3201 return ret;
3202}
3203
3204static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
3205{
3206 struct vm_area_struct *vma = fe->vma;
3207 struct page *fault_page, *new_page;
3208 void *fault_entry;
3209 struct mem_cgroup *memcg;
3210 int ret;
3211
3212 if (unlikely(anon_vma_prepare(vma)))
3213 return VM_FAULT_OOM;
3214
3215 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
3216 if (!new_page)
3217 return VM_FAULT_OOM;
3218
3219 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
3220 &memcg, false)) {
3221 put_page(new_page);
3222 return VM_FAULT_OOM;
3223 }
3224
3225 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
3226 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3227 goto uncharge_out;
3228
3229 if (!(ret & VM_FAULT_DAX_LOCKED))
3230 copy_user_highpage(new_page, fault_page, fe->address, vma);
3231 __SetPageUptodate(new_page);
3232
3233 ret |= alloc_set_pte(fe, memcg, new_page);
3234 if (fe->pte)
3235 pte_unmap_unlock(fe->pte, fe->ptl);
3236 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3237 unlock_page(fault_page);
3238 put_page(fault_page);
3239 } else {
3240 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
3241 }
3242 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3243 goto uncharge_out;
3244 return ret;
3245uncharge_out:
3246 mem_cgroup_cancel_charge(new_page, memcg, false);
3247 put_page(new_page);
3248 return ret;
3249}
3250
3251static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3252{
3253 struct vm_area_struct *vma = fe->vma;
3254 struct page *fault_page;
3255 struct address_space *mapping;
3256 int dirtied = 0;
3257 int ret, tmp;
3258
3259 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3260 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3261 return ret;
3262
3263 /*
3264 * Check if the backing address space wants to know that the page is
3265 * about to become writable
3266 */
3267 if (vma->vm_ops->page_mkwrite) {
3268 unlock_page(fault_page);
3269 tmp = do_page_mkwrite(vma, fault_page, fe->address);
3270 if (unlikely(!tmp ||
3271 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3272 put_page(fault_page);
3273 return tmp;
3274 }
3275 }
3276
3277 ret |= alloc_set_pte(fe, NULL, fault_page);
3278 if (fe->pte)
3279 pte_unmap_unlock(fe->pte, fe->ptl);
3280 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3281 VM_FAULT_RETRY))) {
3282 unlock_page(fault_page);
3283 put_page(fault_page);
3284 return ret;
3285 }
3286
3287 if (set_page_dirty(fault_page))
3288 dirtied = 1;
3289
3290 /*
3291 * Take a local copy of the address_space: page->mapping may be
3292 * zeroed by truncate after unlock_page(), while the address_space
3293 * itself remains pinned by vma->vm_file's reference.
3294 */
3295 mapping = page_rmapping(fault_page);
3296 unlock_page(fault_page);
3297 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3298 /*
3299 * Some device drivers do not set page.mapping but still
3300 * dirty their pages
3301 */
3302 balance_dirty_pages_ratelimited(mapping);
3303 }
3304
3305 if (!vma->vm_ops->page_mkwrite)
3306 file_update_time(vma->vm_file);
3307
3308 return ret;
3309}
3310
3311/*
3312 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3313 * but allow concurrent faults).
3314 * The mmap_sem may have been released depending on flags and our
3315 * return value.  See filemap_fault() and __lock_page_or_retry().
3316 */
3317static int do_fault(struct fault_env *fe)
3318{
3319 struct vm_area_struct *vma = fe->vma;
3320 pgoff_t pgoff = linear_page_index(vma, fe->address);
3321
3322 /* A file mapping without a ->fault handler cannot fault in pages */
3323 if (!vma->vm_ops->fault)
3324 return VM_FAULT_SIGBUS;
3325 if (!(fe->flags & FAULT_FLAG_WRITE))
3326 return do_read_fault(fe, pgoff);
3327 if (!(vma->vm_flags & VM_SHARED))
3328 return do_cow_fault(fe, pgoff);
3329 return do_shared_fault(fe, pgoff);
3330}
3331
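/*
 * Account a NUMA hinting fault against this page and decide, via
 * mpol_misplaced(), whether it should be migrated to another node.
 * Takes a reference on the page; returns the target node, or -1 if the
 * page is already placed correctly.
 */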
3332static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3333 unsigned long addr, int page_nid,
3334 int *flags)
3335{
3336 get_page(page);
3337
3338 count_vm_numa_event(NUMA_HINT_FAULTS);
3339 if (page_nid == numa_node_id()) {
3340 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3341 *flags |= TNF_FAULT_LOCAL;
3342 }
3343
3344 return mpol_misplaced(page, vma, addr);
3345}
3346
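/*
 * Handle a fault on a PROT_NONE pte installed by the NUMA balancing
 * code: make the pte accessible again, record the hinting fault and,
 * if the page turns out to be on the wrong node, try to migrate it.
 */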
3347static int do_numa_page(struct fault_env *fe, pte_t pte)
3348{
3349 struct vm_area_struct *vma = fe->vma;
3350 struct page *page = NULL;
3351 int page_nid = -1;
3352 int last_cpupid;
3353 int target_nid;
3354 bool migrated = false;
3355 bool was_writable = pte_write(pte);
3356 int flags = 0;
3357
3358 /*
3359 * The "pte" argument was read without the page table lock, so it
3360 * cannot be used safely without revalidation: retake the lock and
3361 * check pte_same() before acting on it.
3362 *
3363 * We can safely just do a set_pte_at(), because the old (protnone)
3364 * page table entry is not accessible, so there can be no concurrent
3365 * hardware modifications to the PTE.
3366 */
3367 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
3368 spin_lock(fe->ptl);
3369 if (unlikely(!pte_same(*fe->pte, pte))) {
3370 pte_unmap_unlock(fe->pte, fe->ptl);
3371 goto out;
3372 }
3373
3374 /* Make it present again */
3375 pte = pte_modify(pte, vma->vm_page_prot);
3376 pte = pte_mkyoung(pte);
3377 if (was_writable)
3378 pte = pte_mkwrite(pte);
3379 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
3380 update_mmu_cache(vma, fe->address, fe->pte);
3381
3382 page = vm_normal_page(vma, fe->address, pte);
3383 if (!page) {
3384 pte_unmap_unlock(fe->pte, fe->ptl);
3385 return 0;
3386 }
3387
3388 /* TODO: handle PTE-mapped THP */
3389 if (PageCompound(page)) {
3390 pte_unmap_unlock(fe->pte, fe->ptl);
3391 return 0;
3392 }
3393
3394 /*
3395 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3396 * much anyway since they can be in shared cache state. This misses
3397 * the case where a mapping is writable but the process never writes
3398 * to it but pte_write gets cleared during protection updates and
3399 * pte_dirty has unpredictable behaviour between PTE scan updates,
3400 * background writeback, dirty balancing and application behaviour.
3401 */
3402 if (!pte_write(pte))
3403 flags |= TNF_NO_GROUP;
3404
3405 /*
3406 * Flag if the page is shared between multiple address spaces. This
3407 * is later used when determining whether to group tasks together
3408 */
3409 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3410 flags |= TNF_SHARED;
3411
3412 last_cpupid = page_cpupid_last(page);
3413 page_nid = page_to_nid(page);
3414 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
3415 &flags);
3416 pte_unmap_unlock(fe->pte, fe->ptl);
3417 if (target_nid == -1) {
3418 put_page(page);
3419 goto out;
3420 }
3421
3422 /* Migrate to the requested node */
3423 migrated = migrate_misplaced_page(page, vma, target_nid);
3424 if (migrated) {
3425 page_nid = target_nid;
3426 flags |= TNF_MIGRATED;
3427 } else
3428 flags |= TNF_MIGRATE_FAIL;
3429
3430out:
3431 if (page_nid != -1)
3432 task_numa_fault(last_cpupid, page_nid, 1, flags);
3433 return 0;
3434}
3435
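/*
 * First touch of an empty pmd: give the vma a chance to map it with a
 * transparent huge page, either anonymously or via ->pmd_fault().
 * VM_FAULT_FALLBACK means the caller should map regular ptes instead.
 */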
3436static int create_huge_pmd(struct fault_env *fe)
3437{
3438 struct vm_area_struct *vma = fe->vma;
3439 if (vma_is_anonymous(vma))
3440 return do_huge_pmd_anonymous_page(fe);
3441 if (vma->vm_ops->pmd_fault)
3442 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
3443 fe->flags);
3444 return VM_FAULT_FALLBACK;
3445}
3446
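/*
 * Write fault on a read-only huge pmd.  Anonymous and ->pmd_fault()
 * vmas handle it at pmd level; otherwise the pmd is split and the COW
 * is handled on the pte level by the caller.
 */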
3447static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
3448{
3449 if (vma_is_anonymous(fe->vma))
3450 return do_huge_pmd_wp_page(fe, orig_pmd);
3451 if (fe->vma->vm_ops->pmd_fault)
3452 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
3453 fe->flags);
3454
3455 /* COW handled on pte level: split pmd. */
3456 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
3457 split_huge_pmd(fe->vma, fe->pmd, fe->address);
3458
3459 return VM_FAULT_FALLBACK;
3460}
3461
3462static inline bool vma_is_accessible(struct vm_area_struct *vma)
3463{
3464 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3465}
3466
3467/*
3468 * These routines also need to handle stuff like marking pages dirty
3469 * and/or accessed for architectures that don't do it in hardware (most
3470 * RISC architectures).  The early dirtying is also good on the i386.
3471 *
3472 * There is also a hook called "update_mmu_cache()" that architectures
3473 * with external mmu caches can use to update those (ie the Sparc or
3474 * PowerPC hashed page tables that act as extended TLBs).
3475 *
3476 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
3477 * concurrent faults).
3478 *
3479 * The mmap_sem may have been released depending on flags and our return
3480 * value.  See filemap_fault() and __lock_page_or_retry().
3481 */
3482static int handle_pte_fault(struct fault_env *fe)
3483{
3484 pte_t entry;
3485
3486 if (unlikely(pmd_none(*fe->pmd))) {
3487 /*
3488 * Leave __pte_alloc() until later: because vm_ops->fault may
3489 * want to allocate huge page, and if we expose page table
3490 * for an instant, it will be difficult to retract from
3491 * concurrent faults and from rmap lookups.
3492 */
3493 fe->pte = NULL;
3494 } else {
3495 /* See comment in pte_alloc_one_map() */
3496 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
3497 return 0;
3498 /*
3499 * A regular pmd is established and it can't morph into a huge
3500 * pmd from under us anymore at this point because we hold the
3501 * mmap_sem read mode and khugepaged takes it in write mode.
3502 * So now it's safe to run pte_offset_map().
3503 */
3504 fe->pte = pte_offset_map(fe->pmd, fe->address);
3505
3506 entry = *fe->pte;
3507
3508 /*
3509 * Some architectures can have larger ptes than wordsize,
3510 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
3511 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
3512 * accesses.  The code below just needs a consistent view
3513 * for the ifs and we later double check anyway with the
3514 * ptl lock held.  So here a barrier will do.
3515 */
3516 barrier();
3517 if (pte_none(entry)) {
3518 pte_unmap(fe->pte);
3519 fe->pte = NULL;
3520 }
3521 }
3522
3523 if (!fe->pte) {
3524 if (vma_is_anonymous(fe->vma))
3525 return do_anonymous_page(fe);
3526 else
3527 return do_fault(fe);
3528 }
3529
3530 if (!pte_present(entry))
3531 return do_swap_page(fe, entry);
3532
3533 if (pte_protnone(entry) && vma_is_accessible(fe->vma))
3534 return do_numa_page(fe, entry);
3535
3536 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
3537 spin_lock(fe->ptl);
3538 if (unlikely(!pte_same(*fe->pte, entry)))
3539 goto unlock;
3540 if (fe->flags & FAULT_FLAG_WRITE) {
3541 if (!pte_write(entry))
3542 return do_wp_page(fe, entry);
3543 entry = pte_mkdirty(entry);
3544 }
3545 entry = pte_mkyoung(entry);
3546 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
3547 fe->flags & FAULT_FLAG_WRITE)) {
3548 update_mmu_cache(fe->vma, fe->address, fe->pte);
3549 } else {
3550 /*
3551 * This is needed only for protection faults but the arch code
3552 * is not yet telling us if this is a protection fault or not.
3553 * This still avoids useless tlb flushes for .text page faults
3554 * with threads.
3555 */
3556 if (fe->flags & FAULT_FLAG_WRITE)
3557 flush_tlb_fix_spurious_fault(fe->vma, fe->address);
3558 }
3559unlock:
3560 pte_unmap_unlock(fe->pte, fe->ptl);
3561 return 0;
3562}
3563
3564/*
3565 * By the time we get here, we already hold the mm semaphore.
3566 *
3567 * The mmap_sem may have been released depending on flags and our
3568 * return value.  See filemap_fault() and __lock_page_or_retry().
3569 */
3570static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3571 unsigned int flags)
3572{
3573 struct fault_env fe = {
3574 .vma = vma,
3575 .address = address,
3576 .flags = flags,
3577 };
3578 struct mm_struct *mm = vma->vm_mm;
3579 pgd_t *pgd;
3580 pud_t *pud;
3581
3582 pgd = pgd_offset(mm, address);
3583 pud = pud_alloc(mm, pgd, address);
3584 if (!pud)
3585 return VM_FAULT_OOM;
3586 fe.pmd = pmd_alloc(mm, pud, address);
3587 if (!fe.pmd)
3588 return VM_FAULT_OOM;
3589 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
3590 int ret = create_huge_pmd(&fe);
3591 if (!(ret & VM_FAULT_FALLBACK))
3592 return ret;
3593 } else {
3594 pmd_t orig_pmd = *fe.pmd;
3595 int ret;
3596
3597 barrier();
3598 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3599 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3600 return do_huge_pmd_numa_page(&fe, orig_pmd);
3601
3602 if ((fe.flags & FAULT_FLAG_WRITE) &&
3603 !pmd_write(orig_pmd)) {
3604 ret = wp_huge_pmd(&fe, orig_pmd);
3605 if (!(ret & VM_FAULT_FALLBACK))
3606 return ret;
3607 } else {
3608 huge_pmd_set_accessed(&fe, orig_pmd);
3609 return 0;
3610 }
3611 }
3612 }
3613
3614 return handle_pte_fault(&fe);
3615}
3616
3617/*
3618 * Entry point for the arch fault handlers: by the time we get here,
3619 * we already hold the mm semaphore.  The mmap_sem may have been
3620 * released depending on flags and our return value; see
3621 * filemap_fault() and __lock_page_or_retry().
3622 */
3623int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3624 unsigned int flags)
3625{
3626 int ret;
3627
3628 __set_current_state(TASK_RUNNING);
3629
3630 count_vm_event(PGFAULT);
3631 mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
3632
3633 /* do counter updates before entering really critical section. */
3634 check_sync_rss_stat(current);
3635
3636 /*
3637 * Enable the memcg OOM handling for faults triggered in user
3638 * space.  Kernel faults are handled more gracefully.
3639 */
3640 if (flags & FAULT_FLAG_USER)
3641 mem_cgroup_oom_enable();
3642
3643 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3644 flags & FAULT_FLAG_INSTRUCTION,
3645 flags & FAULT_FLAG_REMOTE))
3646 return VM_FAULT_SIGSEGV;
3647
3648 if (unlikely(is_vm_hugetlb_page(vma)))
3649 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
3650 else
3651 ret = __handle_mm_fault(vma, address, flags);
3652
3653 if (flags & FAULT_FLAG_USER) {
3654 mem_cgroup_oom_disable();
3655 /*
3656 * The task may have entered a memcg OOM situation but
3657 * if the allocation error was handled gracefully (no
3658 * VM_FAULT_OOM), there is no need to kill anything.
3659 * Just clean up the OOM state peacefully.
3660 */
3661 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3662 mem_cgroup_oom_synchronize(false);
3663 }
3664
3665
3666 /*
3667 * This mm has already been reaped by the OOM reaper, so the refault
3668 * cannot be trusted: anonymous refaults would hand back a zero page
3669 * instead of the original contents.
3670 *
3671 * Force SIGBUS for kernel threads (e.g. use_mm() users) that fault
3672 * on such an mm, unless the fault already failed for another reason.
3673 */
3674 if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
3675 && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
3676 ret = VM_FAULT_SIGBUS;
3677
3678 return ret;
3679}
3680EXPORT_SYMBOL_GPL(handle_mm_fault);
3681
3682#ifndef __PAGETABLE_PUD_FOLDED
3683/*
3684 * Allocate page upper directory.
3685 * We've already handled the fast-path in-line.
3686 */
3687int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3688{
3689 pud_t *new = pud_alloc_one(mm, address);
3690 if (!new)
3691 return -ENOMEM;
3692
3693 smp_wmb();
3694
3695 spin_lock(&mm->page_table_lock);
3696 if (pgd_present(*pgd))
3697 pud_free(mm, new);
3698 else
3699 pgd_populate(mm, pgd, new);
3700 spin_unlock(&mm->page_table_lock);
3701 return 0;
3702}
3703#endif
3704
3705#ifndef __PAGETABLE_PMD_FOLDED
3706/*
3707 * Allocate page middle directory.
3708 * We've already handled the fast-path in-line.
3709 */
3710int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3711{
3712 pmd_t *new = pmd_alloc_one(mm, address);
3713 if (!new)
3714 return -ENOMEM;
3715
3716 smp_wmb();
3717
3718 spin_lock(&mm->page_table_lock);
3719#ifndef __ARCH_HAS_4LEVEL_HACK
3720 if (!pud_present(*pud)) {
3721 mm_inc_nr_pmds(mm);
3722 pud_populate(mm, pud, new);
3723 } else
3724 pmd_free(mm, new);
3725#else
3726 if (!pgd_present(*pud)) {
3727 mm_inc_nr_pmds(mm);
3728 pgd_populate(mm, pud, new);
3729 } else
3730 pmd_free(mm, new);
3731#endif
3732 spin_unlock(&mm->page_table_lock);
3733 return 0;
3734}
3735#endif
3736
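/*
 * Walk the page tables down to the pte that maps @address and return it
 * mapped and locked (*ptlp holds the lock).  Fails with -EINVAL if no
 * present pte exists; huge pmds are not handled here.
 */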
3737static int __follow_pte(struct mm_struct *mm, unsigned long address,
3738 pte_t **ptepp, spinlock_t **ptlp)
3739{
3740 pgd_t *pgd;
3741 pud_t *pud;
3742 pmd_t *pmd;
3743 pte_t *ptep;
3744
3745 pgd = pgd_offset(mm, address);
3746 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3747 goto out;
3748
3749 pud = pud_offset(pgd, address);
3750 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3751 goto out;
3752
3753 pmd = pmd_offset(pud, address);
3754 VM_BUG_ON(pmd_trans_huge(*pmd));
3755 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3756 goto out;
3757
3758 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3759 if (pmd_huge(*pmd))
3760 goto out;
3761
3762 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3763 if (!ptep)
3764 goto out;
3765 if (!pte_present(*ptep))
3766 goto unlock;
3767 *ptepp = ptep;
3768 return 0;
3769unlock:
3770 pte_unmap_unlock(ptep, *ptlp);
3771out:
3772 return -EINVAL;
3773}
3774
3775static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3776 pte_t **ptepp, spinlock_t **ptlp)
3777{
3778 int res;
3779
3780
3781 (void) __cond_lock(*ptlp,
3782 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3783 return res;
3784}
3785
3786/**
3787 * follow_pfn - look up PFN at a user virtual address
3788 * @vma: memory mapping
3789 * @address: user virtual address
3790 * @pfn: location to store found PFN
3791 *
3792 * Only IO mappings and raw PFN mappings are allowed.
3793 *
3794 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3795 */
3796int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3797 unsigned long *pfn)
3798{
3799 int ret = -EINVAL;
3800 spinlock_t *ptl;
3801 pte_t *ptep;
3802
3803 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3804 return ret;
3805
3806 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3807 if (ret)
3808 return ret;
3809 *pfn = pte_pfn(*ptep);
3810 pte_unmap_unlock(ptep, ptl);
3811 return 0;
3812}
3813EXPORT_SYMBOL(follow_pfn);
3814
3815#ifdef CONFIG_HAVE_IOREMAP_PROT
3816int follow_phys(struct vm_area_struct *vma,
3817 unsigned long address, unsigned int flags,
3818 unsigned long *prot, resource_size_t *phys)
3819{
3820 int ret = -EINVAL;
3821 pte_t *ptep, pte;
3822 spinlock_t *ptl;
3823
3824 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3825 goto out;
3826
3827 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3828 goto out;
3829 pte = *ptep;
3830
3831 if ((flags & FOLL_WRITE) && !pte_write(pte))
3832 goto unlock;
3833
3834 *prot = pgprot_val(pte_pgprot(pte));
3835 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3836
3837 ret = 0;
3838unlock:
3839 pte_unmap_unlock(ptep, ptl);
3840out:
3841 return ret;
3842}
3843
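/*
 * Generic ->access helper for VM_IO/VM_PFNMAP mappings: resolve the
 * physical address behind addr with follow_phys(), ioremap it and copy
 * the data in or out.  Drivers can hook this up as vm_ops->access so
 * that __access_remote_vm() can reach such mappings.
 */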
3844int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3845 void *buf, int len, int write)
3846{
3847 resource_size_t phys_addr;
3848 unsigned long prot = 0;
3849 void __iomem *maddr;
3850 int offset = addr & (PAGE_SIZE-1);
3851
3852 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3853 return -EINVAL;
3854
3855 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
3856 if (write)
3857 memcpy_toio(maddr + offset, buf, len);
3858 else
3859 memcpy_fromio(buf, maddr + offset, len);
3860 iounmap(maddr);
3861
3862 return len;
3863}
3864EXPORT_SYMBOL_GPL(generic_access_phys);
3865#endif
3866
3867/*
3868 * Access another process' address space as given in mm.  If non-NULL, use the
3869 * given task for page fault accounting.
3870 */
3871static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3872 unsigned long addr, void *buf, int len, unsigned int gup_flags)
3873{
3874 struct vm_area_struct *vma;
3875 void *old_buf = buf;
3876 int write = gup_flags & FOLL_WRITE;
3877
3878 down_read(&mm->mmap_sem);
3879
3880 while (len) {
3881 int bytes, ret, offset;
3882 void *maddr;
3883 struct page *page = NULL;
3884
3885 ret = get_user_pages_remote(tsk, mm, addr, 1,
3886 gup_flags, &page, &vma);
3887 if (ret <= 0) {
3888#ifndef CONFIG_HAVE_IOREMAP_PROT
3889 break;
3890#else
3891 /*
3892 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3893 * we can access using slightly different code.
3894 */
3895 vma = find_vma(mm, addr);
3896 if (!vma || vma->vm_start > addr)
3897 break;
3898 if (vma->vm_ops && vma->vm_ops->access)
3899 ret = vma->vm_ops->access(vma, addr, buf,
3900 len, write);
3901 if (ret <= 0)
3902 break;
3903 bytes = ret;
3904#endif
3905 } else {
3906 bytes = len;
3907 offset = addr & (PAGE_SIZE-1);
3908 if (bytes > PAGE_SIZE-offset)
3909 bytes = PAGE_SIZE-offset;
3910
3911 maddr = kmap(page);
3912 if (write) {
3913 copy_to_user_page(vma, page, addr,
3914 maddr + offset, buf, bytes);
3915 set_page_dirty_lock(page);
3916 } else {
3917 copy_from_user_page(vma, page, addr,
3918 buf, maddr + offset, bytes);
3919 }
3920 kunmap(page);
3921 put_page(page);
3922 }
3923 len -= bytes;
3924 buf += bytes;
3925 addr += bytes;
3926 }
3927 up_read(&mm->mmap_sem);
3928
3929 return buf - old_buf;
3930}
3931
3932/**
3933 * access_remote_vm - access another process' address space
3934 * @mm: the mm_struct of the target address space
3935 * @addr: start address to access
3936 * @buf: source or destination buffer
3937 * @len: number of bytes to transfer
3938 * @gup_flags: flags modifying lookup behaviour
3939 *
3940 * The caller must hold a reference on @mm.
3941 */
3942int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3943 void *buf, int len, unsigned int gup_flags)
3944{
3945 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
3946}
3947
3948/*
3949 * Access another process' address space.
3950 * Source/target buffer must be kernel space.
3951 * Do not walk the page table directly, use get_user_pages().
3952 */
3953int access_process_vm(struct task_struct *tsk, unsigned long addr,
3954 void *buf, int len, unsigned int gup_flags)
3955{
3956 struct mm_struct *mm;
3957 int ret;
3958
3959 mm = get_task_mm(tsk);
3960 if (!mm)
3961 return 0;
3962
3963 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
3964
3965 mmput(mm);
3966
3967 return ret;
3968}
3969
3970/*
3971 * Print the name of a VMA.
3972 */
3973void print_vma_addr(char *prefix, unsigned long ip)
3974{
3975 struct mm_struct *mm = current->mm;
3976 struct vm_area_struct *vma;
3977
3978 /*
3979 * Do not print if we are in atomic
3980 * contexts (in exception stacks, etc.):
3981 */
3982 if (preempt_count())
3983 return;
3984
3985 down_read(&mm->mmap_sem);
3986 vma = find_vma(mm, ip);
3987 if (vma && vma->vm_file) {
3988 struct file *f = vma->vm_file;
3989 char *buf = (char *)__get_free_page(GFP_KERNEL);
3990 if (buf) {
3991 char *p;
3992
3993 p = file_path(f, buf, PAGE_SIZE);
3994 if (IS_ERR(p))
3995 p = "?";
3996 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3997 vma->vm_start,
3998 vma->vm_end - vma->vm_start);
3999 free_page((unsigned long)buf);
4000 }
4001 }
4002 up_read(&mm->mmap_sem);
4003}
4004
4005#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4006void __might_fault(const char *file, int line)
4007{
4008 /*
4009 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
4010 * holding the mmap_sem, this is safe because kernel memory doesn't
4011 * get paged out, therefore we'll never actually fault, and the
4012 * below annotations would generate false positives.
4013 */
4014 if (segment_eq(get_fs(), KERNEL_DS))
4015 return;
4016 if (pagefault_disabled())
4017 return;
4018 __might_sleep(file, line, 0);
4019#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4020 if (current->mm)
4021 might_lock_read(&current->mm->mmap_sem);
4022#endif
4023}
4024EXPORT_SYMBOL(__might_fault);
4025#endif
4026
4027#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
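/*
 * Helpers for clearing and copying huge pages one base page at a time,
 * with cond_resched() between pages.  The *_gigantic_page() variants
 * use mem_map_next() because the pages of a gigantic compound page are
 * not guaranteed to be contiguous in the mem_map.
 */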
4028static void clear_gigantic_page(struct page *page,
4029 unsigned long addr,
4030 unsigned int pages_per_huge_page)
4031{
4032 int i;
4033 struct page *p = page;
4034
4035 might_sleep();
4036 for (i = 0; i < pages_per_huge_page;
4037 i++, p = mem_map_next(p, page, i)) {
4038 cond_resched();
4039 clear_user_highpage(p, addr + i * PAGE_SIZE);
4040 }
4041}
4042void clear_huge_page(struct page *page,
4043 unsigned long addr, unsigned int pages_per_huge_page)
4044{
4045 int i;
4046
4047 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4048 clear_gigantic_page(page, addr, pages_per_huge_page);
4049 return;
4050 }
4051
4052 might_sleep();
4053 for (i = 0; i < pages_per_huge_page; i++) {
4054 cond_resched();
4055 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4056 }
4057}
4058
4059static void copy_user_gigantic_page(struct page *dst, struct page *src,
4060 unsigned long addr,
4061 struct vm_area_struct *vma,
4062 unsigned int pages_per_huge_page)
4063{
4064 int i;
4065 struct page *dst_base = dst;
4066 struct page *src_base = src;
4067
4068 for (i = 0; i < pages_per_huge_page; ) {
4069 cond_resched();
4070 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4071
4072 i++;
4073 dst = mem_map_next(dst, dst_base, i);
4074 src = mem_map_next(src, src_base, i);
4075 }
4076}
4077
4078void copy_user_huge_page(struct page *dst, struct page *src,
4079 unsigned long addr, struct vm_area_struct *vma,
4080 unsigned int pages_per_huge_page)
4081{
4082 int i;
4083
4084 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4085 copy_user_gigantic_page(dst, src, addr, vma,
4086 pages_per_huge_page);
4087 return;
4088 }
4089
4090 might_sleep();
4091 for (i = 0; i < pages_per_huge_page; i++) {
4092 cond_resched();
4093 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4094 }
4095}
4096#endif
4097
4098#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4099
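/*
 * When split pte locks are in use and a spinlock does not fit into
 * struct page (ALLOC_SPLIT_PTLOCKS), the locks are allocated from this
 * dedicated slab cache instead.
 */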
4100static struct kmem_cache *page_ptl_cachep;
4101
4102void __init ptlock_cache_init(void)
4103{
4104 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4105 SLAB_PANIC, NULL);
4106}
4107
4108bool ptlock_alloc(struct page *page)
4109{
4110 spinlock_t *ptl;
4111
4112 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4113 if (!ptl)
4114 return false;
4115 page->ptl = ptl;
4116 return true;
4117}
4118
4119void ptlock_free(struct page *page)
4120{
4121 kmem_cache_free(page_ptl_cachep, page->ptl);
4122}
4123#endif
4124