/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* Flat memory model: a single global mem_map array covers all page frames. */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * Upper bound of the kernel's direct-mapped (lowmem) address range;
 * ioremap() and friends rely on it to tell lowmem from highmem.
 */
void *high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Address-space layout randomization: 0 = off, 1 = randomize stack,
 * mmap base and VDSO, 2 = additionally randomize the heap (brk).
 * CONFIG_COMPAT_BRK keeps brk unrandomized, since ancient (libc5
 * based) binaries can segfault with a moved brk.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

EXPORT_SYMBOL(zero_pfn);

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init();
 * record its pfn early so is_zero_pfn() checks can use it.
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* Fold the per-task counters back into the mm once per 64 events. */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}
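
/*
 * tlb_gather_mmu() initializes an (on-stack) mmu_gather structure for
 * page-table tear-down over [start, end). A start of 0 with an end of
 * -1 denotes a full-mm flush (exit/execve), which permits more
 * aggressive batching since no other user of the mm can race with us.
 */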
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* A [0, ~0UL) range means the whole address space is going away. */
	tlb->fullmm = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif

	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}
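
/*
 * tlb_finish_mmu() is called at the end of the shootdown operation to
 * free up any resources that were required.
 */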
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}
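
/*
 * __tlb_remove_page() queues a page to be freed once the TLB has been
 * flushed. Returns the number of slots still free in the current
 * batch, or 0 if the batch was full and no further batch could be
 * allocated (in which case the caller must flush before continuing).
 */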
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
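
/*
 * Semi RCU freeing of page-table pages: an IPI to every CPU (or, for
 * the batched path, a call_rcu_sched() grace period) guarantees that
 * no lockless software page-table walker can still be traversing a
 * table before the page is handed back to the allocator.
 */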
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables
	 * cannot be assumed to be actually RCU-freed. It is however
	 * sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * With fewer than two users of this mm there cannot be a
	 * concurrent page-table walk, so free the table immediately.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

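	/*
	 * floor and ceiling bound how far whole page-table pages (not
	 * just entries) may be freed, so that a neighbouring vma's
	 * tables survive.  Note the asymmetric meaning of zero: for
	 * addr/floor, 0 is the bottom of the address space, while for
	 * end/ceiling, 0 means its top; hence the comparisons below
	 * are done on end - 1 and ceiling - 1, and an addr that wraps
	 * to 0 aborts the walk.
	 */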
	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before
		 * freeing pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup is visible before the pte is made
	 * visible to other CPUs by being put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order.
	 */
	smp_wmb(); /* pair with the dependent loads in page table walkers */

	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}
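
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */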
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page() returns the "struct page" behind a pte, or NULL for
 * "special" mappings that have no struct page (or must not touch it):
 * the zero page, raw VM_PFNMAP mappings, and pte_special() ptes.
 *
 * With __HAVE_ARCH_PTE_SPECIAL the pte itself carries the answer.
 * Without it, VM_MIXEDMAP falls back on pfn_valid(), and VM_PFNMAP
 * relies on the remap_pfn_range() rule that vm_pgoff records the first
 * mapped pfn, so
 *
 *	pfn == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * identifies the (special) linearly-mapped pfns, while anything else
 * in a COW mapping must be a normal, COWed page.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
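
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */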
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on the mmlist */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]++;
			else
				rss[MM_FILEPAGES]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(!__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else
				rss[MM_FILEPAGES]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either because we ran out of
	 * batch buffers or because a dirty pte had to be flushed
	 * before releasing the ptl), free the batched memory too,
	 * and restart if we didn't cover the whole range.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);

		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}
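
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list. The VMA list must be sorted in
 * ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped
 * address range after unmap_vmas() returns.  So the only responsibility
 * here is to ensure that any thus-far unmapped pages are flushed before
 * unmap_vmas() drops the lock and schedules.
 */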
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
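
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Caller must protect the VMA list.
 */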
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
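
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */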
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}
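
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */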
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pud_t *pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t *pmd = pmd_alloc(mm, pud, addr);
		if (pmd) {
			VM_BUG_ON(pmd_trans_huge(*pmd));
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
		}
	}
	return NULL;
}
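
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */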
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, MM_FILEPAGES);
	page_add_file_rmap(page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}
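
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma. The page has to be a clean _individual_ kernel
 * allocation; compound pages must be marked as such (__GFP_COMP) or
 * split up first (see split_page()). The page does not need to be
 * reserved.
 *
 * Unlike remap_pfn_range(), no page protection parameter is taken:
 * the vma's protection is used, so it must already be set up the way
 * the caller wants (e.g. shared writable).
 *
 * Usually this function is called from f_op->mmap() handler under
 * mm->mmap_sem write-lock, so it can change vma->vm_flags. Caller
 * must set VM_MIXEDMAP on vma if it wants to call this from other
 * places, for example from a page-fault handler.
 */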
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	entry = pte_mkspecial(pfn_pte(pfn, prot));
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}
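
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */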
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	int ret;
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return -EINVAL;

	ret = insert_pfn(vma, addr, pfn, pgprot);

	return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);

int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).
	 */
	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
		struct page *page;

		page = pfn_to_page(pfn);
		return insert_page(vma, addr, page, vma->vm_page_prot);
	}
	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_mixed);
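
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */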
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}
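
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */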
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_pud_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
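
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */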
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}
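
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */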
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
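
/*
 * handle_pte_fault chooses a handler based on an entry that was read
 * non-atomically. On configurations where pte_t is wider than a word
 * (e.g. i386 with PAE) the two halves may be torn, so before making
 * any commitment the pte must be re-checked under its lock;
 * pte_unmap_same() does that, and unmaps the (temporarily kmapped)
 * page table either way.
 */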
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}
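
/*
 * Notify the address space that the page is about to become writable
 * so that it can prohibit this or wait for the page to get into an
 * appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */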
static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
	       unsigned long address)
{
	struct vm_fault vmf;
	int ret;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = page->index;
	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
	vmf.page = page;
	vmf.cow_page = NULL;

	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0;
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}
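
/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */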
static inline int wp_page_reuse(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			struct page *page, int page_mkwrite,
			int dirty_shared)
	__releases(ptl)
{
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, address, pte_pfn(orig_pte));
	entry = pte_mkyoung(orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
		update_mmu_cache(vma, address, page_table);
	pte_unmap_unlock(page_table, ptl);

	if (dirty_shared) {
		struct address_space *mapping;
		int dirtied;

		if (!page_mkwrite)
			lock_page(page);

		dirtied = set_page_dirty(page);
		VM_BUG_ON_PAGE(PageAnon(page), page);
		mapping = page->mapping;
		unlock_page(page);
		page_cache_release(page);

		if ((dirtied || page_mkwrite) && mapping) {
			/*
			 * Some device drivers do not set page.mapping
			 * but still dirty their pages
			 */
			balance_dirty_pages_ratelimited(mapping);
		}

		if (!page_mkwrite)
			file_update_time(vma->vm_file);
	}

	return VM_FAULT_WRITE;
}
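
/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the
 *   allocated page.
 * - If the pte is still the way we remember it, update the page table
 *   and all relevant references. This includes dropping the reference
 *   the page-table held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the
 *   old page.
 */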
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *page_table, pmd_t *pmd,
			pte_t orig_pte, struct page *old_page)
{
	struct page *new_page = NULL;
	spinlock_t *ptl = NULL;
	pte_t entry;
	int page_copied = 0;
	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
	struct mem_cgroup *memcg;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
		goto oom_free_new;

	__SetPageUptodate(new_page);

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		mem_cgroup_commit_charge(new_page, memcg, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg);
	}

	if (new_page)
		page_cache_release(new_page);

	pte_unmap_unlock(page_table, ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		page_cache_release(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page)
		page_cache_release(old_page);
	return VM_FAULT_OOM;
}
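
/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */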
static int wp_pfn_shared(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			pmd_t *pmd)
{
	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		struct vm_fault vmf = {
			.page = NULL,
			.pgoff = linear_page_index(vma, address),
			.virtual_address = (void __user *)(address & PAGE_MASK),
			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
		};
		int ret;

		pte_unmap_unlock(page_table, ptl);
		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
		if (ret & VM_FAULT_ERROR)
			return ret;
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		/*
		 * We might have raced with another page fault while we
		 * released the pte_offset_map_lock.
		 */
		if (!pte_same(*page_table, orig_pte)) {
			pte_unmap_unlock(page_table, ptl);
			return 0;
		}
	}
	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
			     NULL, 0, 0);
}

static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
			  unsigned long address, pte_t *page_table,
			  pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
			  struct page *old_page)
	__releases(ptl)
{
	int page_mkwrite = 0;

	page_cache_get(old_page);

	/*
	 * Only catch write-faults on shared writable pages,
	 * read-only shared pages can get COWed by
	 * get_user_pages(.write=1, .force=1).
	 */
	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		int tmp;

		pte_unmap_unlock(page_table, ptl);
		tmp = do_page_mkwrite(vma, old_page, address);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			page_cache_release(old_page);
			return tmp;
		}
		/*
		 * Since we dropped the lock we need to revalidate
		 * the PTE as someone else may have changed it.  If
		 * they did, we just return, as we can count on the
		 * MMU to tell us if they didn't also make it writable.
		 */
		page_table = pte_offset_map_lock(mm, pmd, address,
						 &ptl);
		if (!pte_same(*page_table, orig_pte)) {
			unlock_page(old_page);
			pte_unmap_unlock(page_table, ptl);
			page_cache_release(old_page);
			return 0;
		}
		page_mkwrite = 1;
	}

	return wp_page_reuse(mm, vma, address, page_table, ptl,
			     orig_pte, old_page, page_mkwrite, 1);
}
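
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */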
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(mm, vma, address, page_table, ptl,
					     orig_pte, pmd);

		pte_unmap_unlock(page_table, ptl);
		return wp_page_copy(mm, vma, address, page_table, pmd,
				    orig_pte, old_page);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				pte_unmap_unlock(page_table, ptl);
				page_cache_release(old_page);
				return 0;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/*
			 * The page is all ours.  Move it to our anon_vma so
			 * the rmap code will not search our parent or siblings.
			 * Protected against the rmap code by the page lock.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			return wp_page_reuse(mm, vma, address, page_table, ptl,
					     orig_pte, old_page, 0, 0);
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(mm, vma, address, page_table, pmd,
				      ptl, orig_pte, old_page);
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	page_cache_get(old_page);

	pte_unmap_unlock(page_table, ptl);
	return wp_page_copy(mm, vma, address, page_table, pmd,
			    orig_pte, old_page);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;

		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}
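
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the
 * underlying file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(),
 * which must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */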
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	struct zap_details details;
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	details.check_mapping = even_cows? NULL: mapping;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
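
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */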
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache;
	struct mem_cgroup *memcg;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	int ret = 0;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		swapcache = page;
		goto out_release;
	}

	swapcache = page;
	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is locked, so that only one process takes
	 * advantage of its remaining swap count.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);
	set_pte_at(mm, address, page_table, pte);
	if (page == swapcache) {
		do_page_add_anon_rmap(page, vma, address, exclusive);
		mem_cgroup_commit_charge(page, memcg, true);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, address);
		mem_cgroup_commit_charge(page, memcg, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge(page, memcg);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (page != swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}
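
/*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */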
static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
{
	address &= PAGE_MASK;
	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
		struct vm_area_struct *prev = vma->vm_prev;

		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That's only ok if it's the same stack mapping
		 * that has gotten split..
		 */
		if (prev && prev->vm_end == address)
			return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;

		return expand_downwards(vma, address - PAGE_SIZE);
	}
	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
		struct vm_area_struct *next = vma->vm_next;

		/* As VM_GROWSDOWN but s/below/above/ */
		if (next && next->vm_start == address + PAGE_SIZE)
			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;

		return expand_upwards(vma, address + PAGE_SIZE);
	}
	return 0;
}
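
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */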
2663static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2664 unsigned long address, pte_t *page_table, pmd_t *pmd,
2665 unsigned int flags)
2666{
2667 struct mem_cgroup *memcg;
2668 struct page *page;
2669 spinlock_t *ptl;
2670 pte_t entry;
2671
2672 pte_unmap(page_table);
2673
2674
2675 if (vma->vm_flags & VM_SHARED)
2676 return VM_FAULT_SIGBUS;
2677
2678
2679 if (check_stack_guard_page(vma, address) < 0)
2680 return VM_FAULT_SIGSEGV;
2681
2682
2683 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
2684 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2685 vma->vm_page_prot));
2686 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2687 if (!pte_none(*page_table))
2688 goto unlock;
2689
2690 if (userfaultfd_missing(vma)) {
2691 pte_unmap_unlock(page_table, ptl);
2692 return handle_userfault(vma, address, flags,
2693 VM_UFFD_MISSING);
2694 }
2695 goto setpte;
2696 }
2697
2698
2699 if (unlikely(anon_vma_prepare(vma)))
2700 goto oom;
2701 page = alloc_zeroed_user_highpage_movable(vma, address);
2702 if (!page)
2703 goto oom;
2704
2705 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2706 goto oom_free_page;
2707
2708
2709
2710
2711
2712
2713 __SetPageUptodate(page);
2714
2715 entry = mk_pte(page, vma->vm_page_prot);
2716 if (vma->vm_flags & VM_WRITE)
2717 entry = pte_mkwrite(pte_mkdirty(entry));
2718
2719 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2720 if (!pte_none(*page_table))
2721 goto release;
2722
2723
2724 if (userfaultfd_missing(vma)) {
2725 pte_unmap_unlock(page_table, ptl);
2726 mem_cgroup_cancel_charge(page, memcg);
2727 page_cache_release(page);
2728 return handle_userfault(vma, address, flags,
2729 VM_UFFD_MISSING);
2730 }
2731
2732 inc_mm_counter_fast(mm, MM_ANONPAGES);
2733 page_add_new_anon_rmap(page, vma, address);
2734 mem_cgroup_commit_charge(page, memcg, false);
2735 lru_cache_add_active_or_unevictable(page, vma);
2736setpte:
2737 set_pte_at(mm, address, page_table, entry);
2738
2739
2740 update_mmu_cache(vma, address, page_table);
2741unlock:
2742 pte_unmap_unlock(page_table, ptl);
2743 return 0;
2744release:
2745 mem_cgroup_cancel_charge(page, memcg);
2746 page_cache_release(page);
2747 goto unlock;
2748oom_free_page:
2749 page_cache_release(page);
2750oom:
2751 return VM_FAULT_OOM;
2752}
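
/*
 * Userspace view of the two paths above (a sketch, not part of the
 * original source):
 *
 *	char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	char c = p[0];	// first touch, read: zero-page branch above
 *	p[4096] = 1;	// first touch, write: private page allocation
 *
 * The read is satisfied by the pte_mkspecial(my_zero_pfn()) entry and
 * consumes no anonymous memory.  (A later write to an address already
 * mapped to the zero page is a write-protect fault and goes through
 * do_wp_page() instead.)
 */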

/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
			pgoff_t pgoff, unsigned int flags,
			struct page *cow_page, struct page **page)
{
	struct vm_fault vmf;
	int ret;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;
	vmf.cow_page = cow_page;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;
	if (!vmf.page)
		goto out;

	if (unlikely(PageHWPoison(vmf.page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf.page);
		page_cache_release(vmf.page);
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);

 out:
	*page = vmf.page;
	return ret;
}

/**
 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
 *
 * @vma: virtual memory area
 * @address: user virtual address
 * @page: page to map
 * @pte: pointer to target page table entry
 * @write: true, if new entry is writable
 * @anon: true, if it's anonymous page
 *
 * Caller must hold page table lock relevant for @pte.
 *
 * Target users are page handler itself and implementations of
 * vma_ops->map_pages.
 */
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
		struct page *page, pte_t *pte, bool write, bool anon)
{
	pte_t entry;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (anon) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, address);
	} else {
		inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
		page_add_file_rmap(page);
	}
	set_pte_at(vma->vm_mm, address, pte, entry);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache(vma, address, pte);
}

static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
			&fault_around_bytes_fops);
	if (!ret)
		pr_warn("Failed to create fault_around_bytes in debugfs");
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
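
/*
 * Runtime tuning example (assuming debugfs is mounted at
 * /sys/kernel/debug; values here are illustrative):
 *
 *	# cat /sys/kernel/debug/fault_around_bytes
 *	65536
 *	# echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * Writes are rounded down to a power of two by the setter above, with
 * PAGE_SIZE as the floor and PTRS_PER_PTE pages as the ceiling.
 */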

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock only protects those entries which belong to the
 * page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE worth of pages.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes (and therefore to page order).  This way it's easier to
 * guarantee that we don't cross page table boundaries.
 */
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
	unsigned long start_addr, nr_pages, mask;
	pgoff_t max_pgoff;
	struct vm_fault vmf;
	int off;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	start_addr = max(address & mask, vma->vm_start);
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pte -= off;
	pgoff -= off;

	/*
	 *  max_pgoff is either end of page table or end of vma
	 *  or fault_around_bytes worth of pages from pgoff, depending what
	 *  is nearest.
	 */
	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
			pgoff + nr_pages - 1);

	/* Check if it makes any sense to call ->map_pages */
	while (!pte_none(*pte)) {
		if (++pgoff > max_pgoff)
			return;
		start_addr += PAGE_SIZE;
		if (start_addr >= vma->vm_end)
			return;
		pte++;
	}

	vmf.virtual_address = (void __user *) start_addr;
	vmf.pte = pte;
	vmf.pgoff = pgoff;
	vmf.max_pgoff = max_pgoff;
	vmf.flags = flags;
	vma->vm_ops->map_pages(vma, &vmf);
}
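
/*
 * Worked example (hypothetical numbers: 4K pages, 512 ptes per page
 * table, the default 64K fault_around_bytes), faulting address
 * 0x12345678:
 *
 *	nr_pages   = 65536 >> 12		= 16
 *	mask       = ~(16 * 4096 - 1)		= ...ffff0000
 *	start_addr = 0x12345678 & mask		= 0x12340000
 *	off        = (0x5678 >> 12) & 511	= 5
 *
 * so pte and pgoff are rewound by 5 entries and up to 16 pages from
 * 0x12340000 are mapped in a single ->map_pages() call, subject to the
 * vma and page-table clamping above.
 */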

static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold page cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
		do_fault_around(vma, address, pte, pgoff, flags);
		if (!pte_same(*pte, orig_pte))
			goto unlock_out;
		pte_unmap_unlock(pte, ptl);
	}

	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		unlock_page(fault_page);
		page_cache_release(fault_page);
		return ret;
	}
	do_set_pte(vma, address, fault_page, pte, false, false);
	unlock_page(fault_page);
unlock_out:
	pte_unmap_unlock(pte, ptl);
	return ret;
}

static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page, *new_page;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!new_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
		page_cache_release(new_page);
		return VM_FAULT_OOM;
	}

	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;

	if (fault_page)
		copy_user_highpage(new_page, fault_page, address, vma);
	__SetPageUptodate(new_page);

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		if (fault_page) {
			unlock_page(fault_page);
			page_cache_release(fault_page);
		} else {
			/*
			 * The fault handler has no page to lock, so it holds
			 * i_mmap_lock for read to protect against truncate.
			 */
			i_mmap_unlock_read(vma->vm_file->f_mapping);
		}
		goto uncharge_out;
	}
	do_set_pte(vma, address, new_page, pte, true, true);
	mem_cgroup_commit_charge(new_page, memcg, false);
	lru_cache_add_active_or_unevictable(new_page, vma);
	pte_unmap_unlock(pte, ptl);
	if (fault_page) {
		unlock_page(fault_page);
		page_cache_release(fault_page);
	} else {
		/*
		 * The fault handler has no page to lock, so it holds
		 * i_mmap_lock for read to protect against truncate.
		 */
		i_mmap_unlock_read(vma->vm_file->f_mapping);
	}
	return ret;
uncharge_out:
	mem_cgroup_cancel_charge(new_page, memcg);
	page_cache_release(new_page);
	return ret;
}

static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page;
	struct address_space *mapping;
	spinlock_t *ptl;
	pte_t *pte;
	int dirtied = 0;
	int ret, tmp;

	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(fault_page);
		tmp = do_page_mkwrite(vma, fault_page, address);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			page_cache_release(fault_page);
			return tmp;
		}
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		unlock_page(fault_page);
		page_cache_release(fault_page);
		return ret;
	}
	do_set_pte(vma, address, fault_page, pte, true, false);
	pte_unmap_unlock(pte, ptl);

	if (set_page_dirty(fault_page))
		dirtied = 1;
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().   The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from reordering mapping
	 * after it.
	 */
	mapping = fault_page->mapping;
	unlock_page(fault_page);
	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
		/*
		 * Some device drivers do not set page.mapping but still
		 * dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	/* file_update_time outside page_lock */
	if (!vma->vm_ops->page_mkwrite)
		file_update_time(vma->vm_file);

	return ret;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = (((address & PAGE_MASK)
			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	pte_unmap(page_table);
	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
	if (!vma->vm_ops->fault)
		return VM_FAULT_SIGBUS;
	if (!(flags & FAULT_FLAG_WRITE))
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	if (!(vma->vm_flags & VM_SHARED))
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
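
/*
 * Dispatch summary for the three handlers above (illustrative):
 *
 *	read fault, any file mapping	-> do_read_fault()
 *	write fault, MAP_PRIVATE	-> do_cow_fault()
 *	write fault, MAP_SHARED		-> do_shared_fault()
 */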

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
	struct page *page = NULL;
	spinlock_t *ptl;
	int page_nid = -1;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	bool was_writable = pte_write(pte);
	int flags = 0;

	/* A PROT_NONE fault should not end up here */
	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 *
	 * We can safely just do a "set_pte_at()", because the old
	 * page table entry is not accessible, so there would be no
	 * concurrent hardware modifications to the PTE.
	 */
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*ptep, pte))) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/* Make it present again */
	pte = pte_modify(pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	set_pte_at(mm, addr, ptep, pte);
	update_mmu_cache(vma, addr, ptep);

	page = vm_normal_page(vma, addr, pte);
	if (!page) {
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!(vma->vm_flags & VM_WRITE))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
	pte_unmap_unlock(ptep, ptl);
	if (target_nid == -1) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}

static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, unsigned int flags)
{
	if (vma_is_anonymous(vma))
		return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
	if (vma->vm_ops->pmd_fault)
		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
	return VM_FAULT_FALLBACK;
}

static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
			unsigned int flags)
{
	if (vma_is_anonymous(vma))
		return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
	if (vma->vm_ops->pmd_fault)
		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
	return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
		     struct vm_area_struct *vma, unsigned long address,
		     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	/*
	 * some architectures can have larger ptes than wordsize,
	 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
	 * The code below just needs a consistent view for the ifs and
	 * we later double check anyway with the ptl lock held. So here
	 * a barrier will do.
	 */
	entry = *pte;
	barrier();
	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma_is_anonymous(vma))
				return do_anonymous_page(mm, vma, address,
							 pte, pmd, flags);
			else
				return do_fault(mm, vma, address, pte, pmd,
						flags, entry);
		}
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	if (pte_protnone(entry))
		return do_numa_page(mm, vma, address, entry, pte, pmd);

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}
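
/*
 * The decision tree above, in brief (illustrative):
 *
 *	pte_none			-> do_anonymous_page() / do_fault()
 *	!pte_present (swap entry)	-> do_swap_page()
 *	pte_protnone			-> do_numa_page()
 *	present + write + !pte_write	-> do_wp_page()
 *	otherwise			-> mark young/dirty in place
 */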

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		int ret = create_huge_pmd(mm, vma, address, pmd, flags);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *pmd;
		int ret;

		barrier();
		if (pmd_trans_huge(orig_pmd)) {
			unsigned int dirty = flags & FAULT_FLAG_WRITE;

			/*
			 * If the pmd is splitting, return and retry the
			 * fault.  Alternative: wait until the split is
			 * done, and goto retry.
			 */
			if (pmd_trans_splitting(orig_pmd))
				return 0;

			if (pmd_protnone(orig_pmd))
				return do_huge_pmd_numa_page(mm, vma, address,
							     orig_pmd, pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(mm, vma, address, pmd,
						  orig_pmd, flags);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(mm, vma, address, pmd,
						      orig_pmd, dirty);
				return 0;
			}
		}
	}

	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if an huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(pmd_none(*pmd)) &&
	    unlikely(__pte_alloc(mm, vma, pmd, address)))
		return VM_FAULT_OOM;
	/* if an huge pmd materialized from under us just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	/*
	 * A regular pmd is established and it can't morph into a huge pmd
	 * from under us anymore at this point because we hold the mmap_sem
	 * read mode and khugepaged takes it in write mode. So now it's
	 * safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);

	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		    unsigned long address, unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	ret = __handle_mm_fault(mm, vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
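
/*
 * Typical caller (a sketch of an architecture fault handler, not code
 * from this file): after locating and validating the vma under
 * down_read(&mm->mmap_sem), the arch does something like
 *
 *	fault = handle_mm_fault(mm, vma, address, flags);
 *	if (fault & VM_FAULT_OOM)
 *		goto out_of_memory;
 *	else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
 *		goto bad_area;
 *
 * and retries with FAULT_FLAG_TRIED set when VM_FAULT_RETRY is
 * returned with FAULT_FLAG_ALLOW_RETRY in effect.
 */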

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		pud_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

static int __follow_pte(struct mm_struct *mm, unsigned long address,
		pte_t **ptepp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
	if (pmd_huge(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!ptep)
		goto out;
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
out:
	return -EINVAL;
}

static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte(mm, address, ptepp, ptlp)));
	return res;
}

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);
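
/*
 * Example use (a sketch, not code from this file): a driver that
 * established a VM_PFNMAP mapping can translate a user virtual address
 * within it back to a pfn:
 *
 *	unsigned long pfn;
 *
 *	if (follow_pfn(vma, address, &pfn))
 *		return -EINVAL;
 *
 * V4L2's videobuf, for instance, resolves user PFNMAP buffers this way.
 */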

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
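
/*
 * Drivers expose this through their vm_operations_struct so that
 * ptrace and /proc/<pid>/mem can reach VM_IO mappings; a sketch based
 * on how /dev/mem (drivers/char/mem.c) wires it up:
 *
 *	static const struct vm_operations_struct mmap_mem_ops = {
 *		.access = generic_access_phys
 *	};
 *
 * The ->access hook is then called from __access_remote_vm() below
 * when get_user_pages() cannot pin a page in the mapping.
 */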

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			page_cache_release(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @write:	whether the access is a write
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
	mmput(mm);

	return ret;
}
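
/*
 * Example caller (a sketch, not code from this file): ptrace's generic
 * PEEKDATA path reads one word from the tracee roughly like
 *
 *	unsigned long tmp;
 *	int copied;
 *
 *	copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
 *	if (copied != sizeof(tmp))
 *		return -EIO;
 *
 * with write=1 used for the POKEDATA direction.
 */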

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */
	if (preempt_count())
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&mm->mmap_sem);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate warnings.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}
void clear_huge_page(struct page *page,
		     unsigned long addr, unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif