/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the
 * assumption that high_memory defines the upper bound on direct map
 * memory, then end of ZONE_NORMAL.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);


#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

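/*
 * Advance to the next batch in the mmu_gather's chain of page batches,
 * allocating a fresh batch page when the current one is full. Returns 0
 * when no further batch can be allocated, which forces callers to flush.
 */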
static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return 0;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The [start, end) range describes the span being
 *	torn down; the special case start == 0, end == ~0UL means the full
 *	address space is going away (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->start	= start;
	tlb->end	= end;
	tlb->need_flush = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	if (!tlb->need_flush)
		return;
	tlb->need_flush = 0;
	tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif

	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Queue @page on the mmu_gather for freeing once the TLB has been
 *	flushed. Returns the number of free slots left in the current batch;
 *	a zero return means the gather is full and the caller must flush
 *	(via tlb_flush_mmu()) before removing more pages.
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->need_flush);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON(batch->nr > batch->max);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period; this is just a synchronous IPI
	 * broadcast: once every CPU has taken the interrupt, any lockless
	 * (interrupts-disabled) page-table walk that could still see this
	 * table is guaranteed to have finished, so the table can be freed
	 * immediately.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->need_flush = 1;

	/*
	 * When there's less than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	tlb->mm->nr_ptes--;
}

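/*
 * Clear and free each pte page under this pud, then free the pmd page
 * itself once the whole PUD_SIZE span it covers (clamped against floor
 * and ceiling) is known to be unused.
 */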
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Comparisons in this function are subtle: addr 0 and floor 0 refer
	 * to the bottom of the address space, but end 0 and ceiling 0 refer
	 * to its top, so comparisons must be done as "end - 1" against
	 * "ceiling - 1".
	 *
	 * The clamping below ensures we only free page-table pages whose
	 * entire span lies inside [floor, ceiling): we must not free a
	 * pte page if part of the PMD_SIZE region it maps may still be in
	 * use by a neighbouring vma (hugetlb areas in particular rely on
	 * their own page tables being left intact).
	 */
	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

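/*
 * Kernel (init_mm) counterpart of __pte_alloc(): populate a kernel pmd
 * with a freshly allocated pte page, discarding the allocation if some
 * other thread populated the pmd first.
 */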
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

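/*
 * RSS deltas during copy/zap are accumulated in a local vector and
 * folded into the mm's counters in one go, keeping per-pte counter
 * traffic off the hot path.
 */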
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page);
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
		       vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
		       vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline bool is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a struct
 * page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial. Secondly,
 * an architecture may not have a spare pte bit, in which case we must rely
 * on the vma flags instead:
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered
 * a special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal; we can tell them apart
 * because the "original" un-COWed pages satisfy pfn == vma->vm_pgoff + off.
 *
 * A VM_MIXEDMAP mapping can likewise contain memory with or without "struct
 * page" backing; there a pte is normal exactly when pfn_valid() is true.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on the mmlist */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mappings require pages in both
					 * parent and child to be set to read.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					if (pte_swp_soft_dirty(*src_pte))
						pte = pte_swp_mksoft_dirty(pte);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

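/*
 * Copy one pte page worth of mappings from the parent to the child,
 * periodically dropping the pte locks so that other tasks (and the
 * scheduler) can make progress on large ranges.
 */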
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

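/*
 * Walk the source pmd entries, letting copy_huge_pmd() handle any
 * transparent huge pages and descending into copy_pte_range() for the
 * rest.
 */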
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

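/*
 * Duplicate an entire vma's page tables from the parent mm into the
 * child mm at fork time.
 */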
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
			       VM_PFNMAP | VM_MIXEDMAP))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

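/*
 * Tear down one pte page worth of mappings: accumulate RSS deltas,
 * queue normal pages on the mmu_gather for batched freeing, and drop
 * swap/file/migration entries as appropriate.
 */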
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index) {
				pte_t ptfile = pgoff_to_pte(page->index);
				if (pte_soft_dirty(ptent))
					ptfile = pte_file_mksoft_dirty(ptfile);
				set_pte_at(mm, addr, pte, ptfile);
			}
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			force_flush = !__tlb_remove_page(tlb, page);
			if (force_flush)
				break;
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else {
			swp_entry_t entry = pte_to_swp_entry(ptent);

			if (!non_swap_entry(entry))
				rss[MM_SWAPENTS]--;
			else if (is_migration_entry(entry)) {
				struct page *page;

				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]--;
				else
					rss[MM_FILEPAGES]--;
			}
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * mmu_gather ran out of room to batch pages, we break out of
	 * the PTE lock to avoid doing the potential expensive TLB invalidate
	 * and page-free while holding it.
	 */
	if (force_flush) {
		unsigned long old_end;

		force_flush = 0;

		/*
		 * Flush the TLB just for the previous segment,
		 * then update the range to be the remaining
		 * TLB range.
		 */
		old_end = tlb->end;
		tlb->end = addr;

		tlb_flush_mmu(tlb);

		tlb->start = addr;
		tlb->end = old_end;

		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	mem_cgroup_uncharge_start();
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
	mem_cgroup_uncharge_end();
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      unsigned int *page_mask)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	*page_mask = 0;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		if (flags & FOLL_GET)
			goto out;
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		if (flags & FOLL_GET) {
			/*
			 * Refcount on tail pages are not well-defined and
			 * shouldn't be taken. The caller should handle a NULL
			 * return when trying to follow tail pages.
			 */
			if (PageHead(page))
				get_page(page);
			else {
				page = NULL;
				goto out;
			}
		}
		goto out;
	}
	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
		goto no_page_table;
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
			split_huge_page_pmd(vma, address, pmd);
			goto split_fallthrough;
		}
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(vma, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				*page_mask = HPAGE_PMD_NR - 1;
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* fall through */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte) || pte_file(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto split_fallthrough;
	}
	if ((flags & FOLL_NUMA) && pte_numa(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, we need
			 * only check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return page;

no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}

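/*
 * Is this address the guard page at either end of a growable stack vma?
 */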
static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
{
	return stack_guard_page_start(vma, addr) ||
	       stack_guard_page_end(vma, addr+PAGE_SIZE);
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * __get_user_pages walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long i;
	unsigned long vm_flags;
	unsigned int page_mask;

	if (!nr_pages)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * Require read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (gup_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (gup_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	/*
	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
	 * would be called on PROT_NONE ranges. We must never invoke
	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
	 * page faults would unprotect the PROT_NONE ranges if
	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
	 * bitflag. So to avoid that, don't set FOLL_NUMA if
	 * FOLL_FORCE is set.
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
			unsigned long pg = start & PAGE_MASK;
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* user gate pages are read-only */
			if (gup_flags & FOLL_WRITE)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			vma = get_gate_vma(mm);
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			page_mask = 0;
			goto next_page;
		}

		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;
			unsigned int page_increm;

			/*
			 * If we have a pending SIGKILL, don't keep faulting
			 * pages and potentially allocating memory.
			 */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
			while (!(page = follow_page_mask(vma, start,
						foll_flags, &page_mask))) {
				int ret;
				unsigned int fault_flags = 0;

				/* For mlock, just skip the stack guard page. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					if (ret & (VM_FAULT_HWPOISON |
						   VM_FAULT_HWPOISON_LARGE)) {
						if (i)
							return i;
						else if (gup_flags & FOLL_HWPOISON)
							return -EHWPOISON;
						else
							return -EFAULT;
					}
					if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					BUG();
				}

				if (tsk) {
					if (ret & VM_FAULT_MAJOR)
						tsk->maj_flt++;
					else
						tsk->min_flt++;
				}

				if (ret & VM_FAULT_RETRY) {
					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads. But only
				 * do so when looping for pte_write is futile:
				 * in some cases userspace may also be wanting
				 * to write to the gotten user page, which a
				 * read fault here might prevent (a readonly
				 * page might get reCOWed by userspace write).
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
				page_mask = 0;
			}
next_page:
			if (vmas) {
				vmas[i] = vma;
				page_mask = 0;
			}
			page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
			if (page_increm > nr_pages)
				page_increm = nr_pages;
			i += page_increm;
			start += page_increm * PAGE_SIZE;
			nr_pages -= page_increm;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}
EXPORT_SYMBOL(__get_user_pages);

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 *
 * This is meant to be called in the specific scenario where for locking
 * reasons we try to access user memory in atomic context (within a
 * pagefault_disable() section), this returns -EFAULT, and we want to
 * resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all
 * the necessary SW fixup of the dirty and young bits in the PTE. This is
 * important for some architectures where those bits also gate the access
 * permission to the page; on such architectures, gup() alone is not
 * enough to make a subsequent access succeed.
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags)
{
	struct vm_area_struct *vma;
	int ret;

	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	ret = handle_mm_fault(mm, vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		if (ret & VM_FAULT_OOM)
			return -ENOMEM;
		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
			return -EHWPOISON;
		if (ret & VM_FAULT_SIGBUS)
			return -EFAULT;
		BUG();
	}
	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}
	return 0;
}

/*
 * get_user_pages() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to by the caller
 * @force:	whether to force access even when user mapping is currently
 *		protected (but never forces write access to shared mapping).
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely, so it is largely only useful for pages which
 * are pinned transiently (eg. for direct IO).
 */
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages, int write,
		int force, struct page **pages, struct vm_area_struct **vmas)
{
	int flags = FOLL_TOUCH;

	if (pages)
		flags |= FOLL_GET;
	if (write)
		flags |= FOLL_WRITE;
	if (force)
		flags |= FOLL_FORCE;

	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
				NULL);
}
EXPORT_SYMBOL(get_user_pages);

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by page_cache_release() or put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted or
 * the corefile would otherwise be too small to fit everything in the
 * file offsets that it describes.  (And filling holes is fine: when the
 * file is read back, pages which haven't been written read as zeroes.)
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;

	if (__get_user_pages(current, current->mm, addr, 1,
			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			     NULL) < 1)
		return NULL;
	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */

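/*
 * Walk (allocating intermediate levels if need be) down to the pte for
 * @addr, and return it mapped and locked; the pte lock is returned via
 * *ptl.
 */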
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t * pgd = pgd_offset(mm, addr);
	pud_t * pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t * pmd = pmd_alloc(mm, pud, addr);
		if (pmd) {
			VM_BUG_ON(pmd_trans_huge(*pmd));
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
		}
	}
	return NULL;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, MM_FILEPAGES);
	page_add_file_rmap(page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

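/*
 * Install a special (no struct page) pte for @pfn at @addr, failing
 * with -EBUSY if a mapping is already present.
 */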
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	entry = pte_mkspecial(pfn_pte(pfn, prot));
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	int ret;
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return -EINVAL;

	ret = insert_pfn(vma, addr, pfn, pgprot);

	return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);

int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).
	 */
	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
		struct page *page;

		page = pfn_to_page(pfn);
		return insert_page(vma, addr, page, vma->vm_page_prot);
	}
	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_mixed);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core VM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_pud_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

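/*
 * Lowest level of apply_to_page_range(): map (and lock, for user mms)
 * the pte page and invoke the callback on each pte in the range.
 */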
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically.  Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
 * must check under lock before unmapping the pte and proceeding
 * (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page, *new_page = NULL;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;
	unsigned long mmun_start = 0;	/* For mmu_notifiers */
	unsigned long mmun_end = 0;	/* For mmu_notifiers */

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable as we can't do any dirty
		 * accounting on raw pfn maps.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		goto gotten;
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/*
			 * The page is all ours.  Move it to our anon_vma so
			 * the rmap code will not search our parent or siblings.
			 * Protected against the rmap code by the page lock.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			goto reuse;
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		/*
		 * Only catch write-faults on shared writable pages,
		 * read-only shared pages can get COWed by
		 * get_user_pages(.write=1, .force=1).
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * Notify the address space that the page is about to
			 * become writable so that it can prohibit this or wait
			 * for the page to get into an appropriate state.
			 *
			 * We do this without the lock held, so that it can
			 * sleep if it needs to.
			 */
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);

			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				lock_page(old_page);
				if (!old_page->mapping) {
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Since we dropped the lock we need to revalidate
			 * the pte, after a possibly long sleep inside
			 * ->page_mkwrite(): someone else may have zapped
			 * or changed the mapping in the meantime.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);

reuse:
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, page_table);
		pte_unmap_unlock(page_table, ptl);
		ret |= VM_FAULT_WRITE;

		if (!dirty_page)
			return ret;

		/*
		 * Yes, Virginia, this is actually required to prevent a race
		 * with clear_page_dirty_for_io() from clearing the page dirty
		 * bit after it clear all dirty ptes, but before a racing
		 * do_wp_page installs a dirty pte.
		 *
		 * __do_fault is protected similarly.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
			/* file_update_time outside page_lock */
			if (vma->vm_file)
				file_update_time(vma->vm_file);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				/*
				 * Some device drivers do not set page.mapping
				 * but still dirty their pages
				 */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		return ret;
	}

2769
2770
2771
2772 page_cache_get(old_page);
2773gotten:
2774 pte_unmap_unlock(page_table, ptl);
2775
2776 if (unlikely(anon_vma_prepare(vma)))
2777 goto oom;
2778
2779 if (is_zero_pfn(pte_pfn(orig_pte))) {
2780 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2781 if (!new_page)
2782 goto oom;
2783 } else {
2784 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2785 if (!new_page)
2786 goto oom;
2787 cow_user_page(new_page, old_page, address, vma);
2788 }
2789 __SetPageUptodate(new_page);
2790
2791 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2792 goto oom_free_new;
2793
2794 mmun_start = address & PAGE_MASK;
2795 mmun_end = mmun_start + PAGE_SIZE;
2796 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2797
2798
2799
2800
2801 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2802 if (likely(pte_same(*page_table, orig_pte))) {
2803 if (old_page) {
2804 if (!PageAnon(old_page)) {
2805 dec_mm_counter_fast(mm, MM_FILEPAGES);
2806 inc_mm_counter_fast(mm, MM_ANONPAGES);
2807 }
2808 } else
2809 inc_mm_counter_fast(mm, MM_ANONPAGES);
2810 flush_cache_page(vma, address, pte_pfn(orig_pte));
2811 entry = mk_pte(new_page, vma->vm_page_prot);
2812 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2813
2814
2815
2816
2817
2818
2819 ptep_clear_flush(vma, address, page_table);
2820 page_add_new_anon_rmap(new_page, vma, address);
2821
2822
2823
2824
2825
2826 set_pte_at_notify(mm, address, page_table, entry);
2827 update_mmu_cache(vma, address, page_table);
2828 if (old_page) {
2829 /*
2830 * Only after switching the pte to the new page may
2831 * we remove the mapcount here. Otherwise another
2832 * exclusive pte (to a freed page) could be used
2833 * if it happens to read in the same page that is
2834 * in the swap cache (which is what do_swap_page
2835 * expects), and it could be mapped writable when
2836 * further faults are handled by the fallback
2837 * original pte.
2838 *
2839 * The critical issue is to order this page_remove_rmap
2840 * with the ptep_clear_flush above. Those stores are
2841 * ordered by (if nothing else) the barrier in
2842 * page_remove_rmap.
2843 *
2844 * Then the TLB flush in ptep_clear_flush ensures that
2845 * no process can access the old page before the
2846 * decremented mapcount is visible. And the old page
2847 * cannot be reused until after the decremented
2848 * mapcount is visible. So transitively, TLBs to
2849 * old page will be flushed before it can be reused.
2850 */
2851 page_remove_rmap(old_page);
2852 }
2853
2854 /* Free the old page.. */
2855 new_page = old_page;
2856 ret |= VM_FAULT_WRITE;
2857 } else
2858 mem_cgroup_uncharge_page(new_page);
2859
2860 if (new_page)
2861 page_cache_release(new_page);
2862unlock:
2863 pte_unmap_unlock(page_table, ptl);
2864 if (mmun_end > mmun_start)
2865 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2866 if (old_page) {
2867 /*
2868 * Don't let another task, with possibly unlocked vma,
2869 * keep the mlocked page.
2870 */
2871 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2872 lock_page(old_page);
2873 munlock_vma_page(old_page);
2874 unlock_page(old_page);
2875 }
2876 page_cache_release(old_page);
2877 }
2878 return ret;
2879oom_free_new:
2880 page_cache_release(new_page);
2881oom:
2882 if (old_page)
2883 page_cache_release(old_page);
2884 return VM_FAULT_OOM;
2885
2886unwritable_page:
2887 page_cache_release(old_page);
2888 return ret;
2889}
2890
2891static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2892 unsigned long start_addr, unsigned long end_addr,
2893 struct zap_details *details)
2894{
2895 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2896}
2897
2898static inline void unmap_mapping_range_tree(struct rb_root *root,
2899 struct zap_details *details)
2900{
2901 struct vm_area_struct *vma;
2902 pgoff_t vba, vea, zba, zea;
2903
2904 vma_interval_tree_foreach(vma, root,
2905 details->first_index, details->last_index) {
2906
2907 vba = vma->vm_pgoff;
2908 vea = vba + vma_pages(vma) - 1;
2909
2910 zba = details->first_index;
2911 if (zba < vba)
2912 zba = vba;
2913 zea = details->last_index;
2914 if (zea > vea)
2915 zea = vea;
2916
2917 unmap_mapping_range_vma(vma,
2918 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2919 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2920 details);
2921 }
2922}
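/*
 * Editor's worked example (not from the original source): a VMA with
 * vm_pgoff == 10 covering 10 pages maps file pages 10..19. Zapping
 * file pages 15..17 gives zba == 15 and zea == 17, so the call above
 * unmaps [vm_start + 5 pages, vm_start + 8 pages), i.e. exactly the
 * three virtual pages that back the punched range.
 */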
2923
2924static inline void unmap_mapping_range_list(struct list_head *head,
2925 struct zap_details *details)
2926{
2927 struct vm_area_struct *vma;
2928
2929 /*
2930 * In nonlinear VMAs there is no correspondence between virtual address
2931 * offset and file offset, so we must do an exhaustive search across
2932 * *all* the pages in each nonlinear VMA, not just the pages whose
2933 * virtual address lies outside the file truncation point.
2934 */
2935 list_for_each_entry(vma, head, shared.nonlinear) {
2936 details->nonlinear_vma = vma;
2937 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2938 }
2939}
2940
2941/**
2942 * unmap_mapping_range - unmap the portion of all mmaps in the specified
2943 * address_space corresponding to the specified page range in the
2944 * underlying file.
2945 * @mapping: the address space containing mmaps to be unmapped.
2946 * @holebegin: byte in first page to unmap, relative to the start of the
2947 * underlying file; rounded down to a PAGE_SIZE boundary (unlike
2948 * truncate_pagecache(), partial pages must go too).
2949 * @holelen: size of the prospective hole in bytes; rounded up to a
2950 * PAGE_SIZE boundary. A holelen of zero truncates to the end of
2951 * the file.
2952 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2953 * 0 when invalidating pagecache, don't throw away private data.
2954 */
2955void unmap_mapping_range(struct address_space *mapping,
2956 loff_t const holebegin, loff_t const holelen, int even_cows)
2957{
2958 struct zap_details details;
2959 pgoff_t hba = holebegin >> PAGE_SHIFT;
2960 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2961
2962 /* Check for overflow. */
2963 if (sizeof(holelen) > sizeof(hlen)) {
2964 long long holeend =
2965 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2966 if (holeend & ~(long long)ULONG_MAX)
2967 hlen = ULONG_MAX - hba + 1;
2968 }
2969
2970 details.check_mapping = even_cows ? NULL : mapping;
2971 details.nonlinear_vma = NULL;
2972 details.first_index = hba;
2973 details.last_index = hba + hlen - 1;
2974 if (details.last_index < details.first_index)
2975 details.last_index = ULONG_MAX;
2976
2977
2978 mutex_lock(&mapping->i_mmap_mutex);
2979 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2980 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2981 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2982 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2983 mutex_unlock(&mapping->i_mmap_mutex);
2984}
2985EXPORT_SYMBOL(unmap_mapping_range);
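/*
 * Editor's sketch of a typical caller (assumed, not from this file):
 * truncation unmaps every mapping of the dropped tail before tossing
 * the pagecache, roughly what truncate_pagecache() does:
 *
 *	loff_t holebegin = round_up(newsize, PAGE_SIZE);
 *
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *
 * holelen == 0 means "to end of file", and even_cows == 1 also zaps
 * private COWed copies, which truncation requires.
 */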
2986
2987/*
2988 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2989 * but allow concurrent faults), and pte mapped but not yet locked.
2990 * We return with mmap_sem still held, but pte unmapped and unlocked.
2991 */
2992static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2993 unsigned long address, pte_t *page_table, pmd_t *pmd,
2994 unsigned int flags, pte_t orig_pte)
2995{
2996 spinlock_t *ptl;
2997 struct page *page, *swapcache;
2998 swp_entry_t entry;
2999 pte_t pte;
3000 int locked;
3001 struct mem_cgroup *ptr;
3002 int exclusive = 0;
3003 int ret = 0;
3004
3005 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3006 goto out;
3007
3008 entry = pte_to_swp_entry(orig_pte);
3009 if (unlikely(non_swap_entry(entry))) {
3010 if (is_migration_entry(entry)) {
3011 migration_entry_wait(mm, pmd, address);
3012 } else if (is_hwpoison_entry(entry)) {
3013 ret = VM_FAULT_HWPOISON;
3014 } else {
3015 print_bad_pte(vma, address, orig_pte, NULL);
3016 ret = VM_FAULT_SIGBUS;
3017 }
3018 goto out;
3019 }
3020 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3021 page = lookup_swap_cache(entry);
3022 if (!page) {
3023 page = swapin_readahead(entry,
3024 GFP_HIGHUSER_MOVABLE, vma, address);
3025 if (!page) {
3026 /*
3027 * Back out if somebody else faulted in this pte
3028 * while we released the pte lock.
3029 */
3030 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3031 if (likely(pte_same(*page_table, orig_pte)))
3032 ret = VM_FAULT_OOM;
3033 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3034 goto unlock;
3035 }
3036
3037 /* Had to read the page from swap area: Major fault */
3038 ret = VM_FAULT_MAJOR;
3039 count_vm_event(PGMAJFAULT);
3040 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
3041 } else if (PageHWPoison(page)) {
3042 /*
3043 * hwpoisoned dirty swapcache pages are kept for killing
3044 * own processes.
3045 */
3046 ret = VM_FAULT_HWPOISON;
3047 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3048 swapcache = page;
3049 goto out_release;
3050 }
3051
3052 swapcache = page;
3053 locked = lock_page_or_retry(page, mm, flags);
3054
3055 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3056 if (!locked) {
3057 ret |= VM_FAULT_RETRY;
3058 goto out_release;
3059 }
3060
3061 /*
3062 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
3063 * release the swapcache from under us. The page pin, and pte_same
3064 * test below, are not enough to exclude that. Even if it is still
3065 * swapcache, we need to check that the page's swap has not changed.
3066 */
3067 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
3068 goto out_page;
3069
3070 page = ksm_might_need_to_copy(page, vma, address);
3071 if (unlikely(!page)) {
3072 ret = VM_FAULT_OOM;
3073 page = swapcache;
3074 goto out_page;
3075 }
3076
3077 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
3078 ret = VM_FAULT_OOM;
3079 goto out_page;
3080 }
3081
3082 /*
3083 * Back out if somebody else already faulted in this pte.
3084 */
3085 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3086 if (unlikely(!pte_same(*page_table, orig_pte)))
3087 goto out_nomap;
3088
3089 if (unlikely(!PageUptodate(page))) {
3090 ret = VM_FAULT_SIGBUS;
3091 goto out_nomap;
3092 }
3093
3094 /*
3095 * The page isn't present yet, go ahead with the fault.
3096 *
3097 * Be careful about the sequence of operations here.
3098 * To get its accounting right, reuse_swap_page() must be called
3099 * while the page is locked, so that it only sees the mapped pte;
3100 * the page lock also keeps the swap entry stable until the new
3101 * pte is installed below.
3102 *
3103 * Note the RSS accounting: the page stops being a swap entry
3104 * and becomes an anonymous page in one step, so MM_ANONPAGES is
3105 * bumped and MM_SWAPENTS dropped before the pte is rewritten,
3106 * all under the pte lock taken above.
3107 */
3108 inc_mm_counter_fast(mm, MM_ANONPAGES);
3109 dec_mm_counter_fast(mm, MM_SWAPENTS);
3110 pte = mk_pte(page, vma->vm_page_prot);
3111 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
3112 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3113 flags &= ~FAULT_FLAG_WRITE;
3114 ret |= VM_FAULT_WRITE;
3115 exclusive = 1;
3116 }
3117 flush_icache_page(vma, page);
3118 if (pte_swp_soft_dirty(orig_pte))
3119 pte = pte_mksoft_dirty(pte);
3120 set_pte_at(mm, address, page_table, pte);
3121 if (page == swapcache)
3122 do_page_add_anon_rmap(page, vma, address, exclusive);
3123 else
3124 page_add_new_anon_rmap(page, vma, address);
3125
3126 mem_cgroup_commit_charge_swapin(page, ptr);
3127
3128 swap_free(entry);
3129 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3130 try_to_free_swap(page);
3131 unlock_page(page);
3132 if (page != swapcache) {
3133 /*
3134 * Hold the lock to avoid the swap entry to be reused
3135 * until we take the PT lock for the pte_same() check
3136 * (to avoid false positives from pte_same). For
3137 * further safety release the lock after the swap_free
3138 * so that the swap count won't change under a
3139 * parallel locked swapcache.
3140 */
3141 unlock_page(swapcache);
3142 page_cache_release(swapcache);
3143 }
3144
3145 if (flags & FAULT_FLAG_WRITE) {
3146 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
3147 if (ret & VM_FAULT_ERROR)
3148 ret &= VM_FAULT_ERROR;
3149 goto out;
3150 }
3151
3152 /* No need to invalidate - it was non-present before */
3153 update_mmu_cache(vma, address, page_table);
3154unlock:
3155 pte_unmap_unlock(page_table, ptl);
3156out:
3157 return ret;
3158out_nomap:
3159 mem_cgroup_cancel_charge_swapin(ptr);
3160 pte_unmap_unlock(page_table, ptl);
3161out_page:
3162 unlock_page(page);
3163out_release:
3164 page_cache_release(page);
3165 if (page != swapcache) {
3166 unlock_page(swapcache);
3167 page_cache_release(swapcache);
3168 }
3169 return ret;
3170}
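/*
 * Editor's note on the pattern above: every sleeping step (swap-in
 * I/O, lock_page_or_retry(), the memcg charge) runs with the pte lock
 * dropped, and the handler revalidates afterwards:
 *
 *	pte_unmap_unlock(page_table, ptl);
 *	... sleeping work ...
 *	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 *	if (!pte_same(*page_table, orig_pte))
 *		goto back_out;	// someone else handled the fault
 *
 * This drop-and-recheck idiom is what makes concurrent faults on the
 * same pte safe throughout this file.
 */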
3171
3172/*
3173 * This is like a special single-page "expand_{down|up}wards()",
3174 * except we must first make sure that 'address{-|+}PAGE_SIZE'
3175 * doesn't hit another vma.
3176 */
3177static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3178{
3179 address &= PAGE_MASK;
3180 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3181 struct vm_area_struct *prev = vma->vm_prev;
3182
3183 /*
3184 * Is there a mapping abutting this one below?
3185 *
3186 * That's only ok if it's the same stack mapping
3187 * that has gotten split..
3188 */
3189 if (prev && prev->vm_end == address)
3190 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3191
3192 return expand_downwards(vma, address - PAGE_SIZE);
3193 }
3194 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3195 struct vm_area_struct *next = vma->vm_next;
3196
3197 /* As VM_GROWSDOWN but s/below/above/ */
3198 if (next && next->vm_start == address + PAGE_SIZE)
3199 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3200
3201 return expand_upwards(vma, address + PAGE_SIZE);
3202 }
3203 return 0;
3204}
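/*
 * Editor's example (assumed addresses): for a VM_GROWSDOWN stack VMA
 * starting at 0x7f0000001000, a fault on that first page calls
 * expand_downwards() so the stack grows by one page instead of running
 * straight into whatever sits below; if an unrelated mapping already
 * abuts at that address, the fault fails with -ENOMEM rather than
 * silently merging the stack into it.
 */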
3205
3206/*
3207 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3208 * but allow concurrent faults), and pte mapped but not yet locked.
3209 * We return with mmap_sem still held, but pte unmapped and unlocked.
3210 */
3211static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3212 unsigned long address, pte_t *page_table, pmd_t *pmd,
3213 unsigned int flags)
3214{
3215 struct page *page;
3216 spinlock_t *ptl;
3217 pte_t entry;
3218
3219 pte_unmap(page_table);
3220
3221 /* Check if we need to add a guard page to the stack */
3222 if (check_stack_guard_page(vma, address) < 0)
3223 return VM_FAULT_SIGBUS;
3224
3225 /* Use the zero-page for reads */
3226 if (!(flags & FAULT_FLAG_WRITE)) {
3227 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3228 vma->vm_page_prot));
3229 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3230 if (!pte_none(*page_table))
3231 goto unlock;
3232 goto setpte;
3233 }
3234
3235 /* Allocate our own private page. */
3236 if (unlikely(anon_vma_prepare(vma)))
3237 goto oom;
3238 page = alloc_zeroed_user_highpage_movable(vma, address);
3239 if (!page)
3240 goto oom;
3241
3242 /*
3243 * The memory barrier inside __SetPageUptodate orders preceding
3244 * stores to the page contents before the set_pte_at() write.
3245 */
3246 __SetPageUptodate(page);
3247
3248 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3249 goto oom_free_page;
3250
3251 entry = mk_pte(page, vma->vm_page_prot);
3252 if (vma->vm_flags & VM_WRITE)
3253 entry = pte_mkwrite(pte_mkdirty(entry));
3254
3255 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3256 if (!pte_none(*page_table))
3257 goto release;
3258
3259 inc_mm_counter_fast(mm, MM_ANONPAGES);
3260 page_add_new_anon_rmap(page, vma, address);
3261setpte:
3262 set_pte_at(mm, address, page_table, entry);
3263
3264 /* No need to invalidate - it was non-present before */
3265 update_mmu_cache(vma, address, page_table);
3266unlock:
3267 pte_unmap_unlock(page_table, ptl);
3268 return 0;
3269release:
3270 mem_cgroup_uncharge_page(page);
3271 page_cache_release(page);
3272 goto unlock;
3273oom_free_page:
3274 page_cache_release(page);
3275oom:
3276 return VM_FAULT_OOM;
3277}
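/*
 * Editor's userspace view of the above (illustration only):
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	char c = p[0];	// read fault: maps the shared zero page
 *	p[0] = 1;	// write fault: allocates a private zeroed page
 *
 * The read takes the !FAULT_FLAG_WRITE branch and installs the
 * zero-pfn pte; the later write is then resolved by do_wp_page().
 */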
3278
3279/*
3280 * __do_fault() tries to create a new page mapping. It aggressively
3281 * tries to share with existing pages, but makes a separate copy if
3282 * FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
3283 * the next page fault.
3284 *
3285 * As this is called only for pages that do not currently exist, we
3286 * do not need to flush old virtual caches or the TLB.
3287 *
3288 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3289 * but allow concurrent faults), and pte neither mapped nor locked.
3290 * We return with mmap_sem still held, but pte unmapped and unlocked.
3291 */
3292static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3293 unsigned long address, pmd_t *pmd,
3294 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3295{
3296 pte_t *page_table;
3297 spinlock_t *ptl;
3298 struct page *page;
3299 struct page *cow_page;
3300 pte_t entry;
3301 int anon = 0;
3302 struct page *dirty_page = NULL;
3303 struct vm_fault vmf;
3304 int ret;
3305 int page_mkwrite = 0;
3306
3307 /*
3308 * If we do COW later, allocate page before taking lock_page() on
3309 * the file cache page, to reduce lock holding time.
3310 */
3311 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3312
3313 if (unlikely(anon_vma_prepare(vma)))
3314 return VM_FAULT_OOM;
3315
3316 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3317 if (!cow_page)
3318 return VM_FAULT_OOM;
3319
3320 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3321 page_cache_release(cow_page);
3322 return VM_FAULT_OOM;
3323 }
3324 } else
3325 cow_page = NULL;
3326
3327 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3328 vmf.pgoff = pgoff;
3329 vmf.flags = flags;
3330 vmf.page = NULL;
3331
3332 ret = vma->vm_ops->fault(vma, &vmf);
3333 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3334 VM_FAULT_RETRY)))
3335 goto uncharge_out;
3336
3337 if (unlikely(PageHWPoison(vmf.page))) {
3338 if (ret & VM_FAULT_LOCKED)
3339 unlock_page(vmf.page);
3340 ret = VM_FAULT_HWPOISON;
3341 goto uncharge_out;
3342 }
3343
3344 /*
3345 * For consistency in subsequent calls, make the faulted page
3346 * always locked.
3347 */
3348 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3349 lock_page(vmf.page);
3350 else
3351 VM_BUG_ON(!PageLocked(vmf.page));
3352
3353 /*
3354 * Should we do an early C-O-W break?
3355 */
3356 page = vmf.page;
3357 if (flags & FAULT_FLAG_WRITE) {
3358 if (!(vma->vm_flags & VM_SHARED)) {
3359 page = cow_page;
3360 anon = 1;
3361 copy_user_highpage(page, vmf.page, address, vma);
3362 __SetPageUptodate(page);
3363 } else {
3364 /*
3365 * If the page will be shareable, see if the backing
3366 * address space wants to know that the page is about
3367 * to become writable.
3368 */
3369 if (vma->vm_ops->page_mkwrite) {
3370 int tmp;
3371
3372 unlock_page(page);
3373 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3374 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3375 if (unlikely(tmp &
3376 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3377 ret = tmp;
3378 goto unwritable_page;
3379 }
3380 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3381 lock_page(page);
3382 if (!page->mapping) {
3383 ret = 0;
3384 unlock_page(page);
3385 goto unwritable_page;
3386 }
3387 } else
3388 VM_BUG_ON(!PageLocked(page));
3389 page_mkwrite = 1;
3390 }
3391 }
3392
3393 }
3394
3395 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3396
3397 /*
3398 * This silly early PAGE_DIRTY setting removes a race
3399 * due to the bad i386 page protection. But it's valid
3400 * for other architectures too.
3401 *
3402 * Note that if FAULT_FLAG_WRITE is set, we either now have
3403 * an exclusive copy of the page, or this is a shared mapping,
3404 * so we can make it writable and dirty to avoid having to
3405 * handle that later.
3406 */
3407 /* Only go through if we didn't race with anybody else.. */
3408 if (likely(pte_same(*page_table, orig_pte))) {
3409 flush_icache_page(vma, page);
3410 entry = mk_pte(page, vma->vm_page_prot);
3411 if (flags & FAULT_FLAG_WRITE)
3412 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3413 else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
3414 entry = pte_mksoft_dirty(entry);
3415 if (anon) {
3416 inc_mm_counter_fast(mm, MM_ANONPAGES);
3417 page_add_new_anon_rmap(page, vma, address);
3418 } else {
3419 inc_mm_counter_fast(mm, MM_FILEPAGES);
3420 page_add_file_rmap(page);
3421 if (flags & FAULT_FLAG_WRITE) {
3422 dirty_page = page;
3423 get_page(dirty_page);
3424 }
3425 }
3426 set_pte_at(mm, address, page_table, entry);
3427
3428 /* no need to invalidate: a not-present page won't be cached */
3429 update_mmu_cache(vma, address, page_table);
3430 } else {
3431 if (cow_page)
3432 mem_cgroup_uncharge_page(cow_page);
3433 if (anon)
3434 page_cache_release(page);
3435 else
3436 anon = 1;
3437 }
3438
3439 pte_unmap_unlock(page_table, ptl);
3440
3441 if (dirty_page) {
3442 struct address_space *mapping = page->mapping;
3443 int dirtied = 0;
3444
3445 if (set_page_dirty(dirty_page))
3446 dirtied = 1;
3447 unlock_page(dirty_page);
3448 put_page(dirty_page);
3449 if ((dirtied || page_mkwrite) && mapping) {
3450 /*
3451 * Some device drivers do not set page.mapping but still
3452 * dirty their pages.
3453 */
3454 balance_dirty_pages_ratelimited(mapping);
3455 }
3456
3457 /* file_update_time outside page_lock */
3458 if (vma->vm_file && !page_mkwrite)
3459 file_update_time(vma->vm_file);
3460 } else {
3461 unlock_page(vmf.page);
3462 if (anon)
3463 page_cache_release(vmf.page);
3464 }
3465
3466 return ret;
3467
3468unwritable_page:
3469 page_cache_release(page);
3470 return ret;
3471uncharge_out:
3472 /* fs's fault handler got an error */
3473 if (cow_page) {
3474 mem_cgroup_uncharge_page(cow_page);
3475 page_cache_release(cow_page);
3476 }
3477 return ret;
3478}
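/*
 * Editor's sketch of the ->fault contract that __do_fault() drives
 * (hypothetical handler, not from this file):
 *
 *	static int example_fault(struct vm_area_struct *vma,
 *				 struct vm_fault *vmf)
 *	{
 *		struct address_space *mapping =
 *			file_inode(vma->vm_file)->i_mapping;
 *		struct page *page = find_get_page(mapping, vmf->pgoff);
 *
 *		if (!page)
 *			return VM_FAULT_SIGBUS;
 *		vmf->page = page;
 *		return 0;	// no VM_FAULT_LOCKED: page is unlocked
 *	}
 *
 * __do_fault() takes the reference handed back in vmf->page, locks the
 * page if the handler didn't, and only then maps or COWs it.
 */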
3479
3480static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3481 unsigned long address, pte_t *page_table, pmd_t *pmd,
3482 unsigned int flags, pte_t orig_pte)
3483{
3484 pgoff_t pgoff = (((address & PAGE_MASK)
3485 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3486
3487 pte_unmap(page_table);
3488 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3489}
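/*
 * Editor's worked example: for a mapping created with
 * mmap(..., fd, 100 << PAGE_SHIFT), vm_pgoff is 100; a fault three
 * pages past vm_start computes pgoff = 3 + 100 = 103, so the fault is
 * filled from page 103 of the file.
 */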
3490
3491/*
3492 * Fault of a previously existing named mapping. Repopulate the pte
3493 * from the encoded file_pte if possible. This enables swappable
3494 * non-linear vmas.
3495 *
3496 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3497 * but allow concurrent faults), and pte mapped but not yet locked.
3498 * We return with mmap_sem still held, but pte unmapped and unlocked.
3499 */
3500static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3501 unsigned long address, pte_t *page_table, pmd_t *pmd,
3502 unsigned int flags, pte_t orig_pte)
3503{
3504 pgoff_t pgoff;
3505
3506 flags |= FAULT_FLAG_NONLINEAR;
3507
3508 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3509 return 0;
3510
3511 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3512 /*
3513 * Page table corrupted: show pte and kill process.
3514 */
3515 print_bad_pte(vma, address, orig_pte, NULL);
3516 return VM_FAULT_SIGBUS;
3517 }
3518
3519 pgoff = pte_to_pgoff(orig_pte);
3520 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3521}
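/*
 * Editor's note (illustration): nonlinear ptes are created by
 * remap_file_pages(), which rewires pages inside an existing mapping:
 *
 *	void *p = mmap(NULL, 4 * 4096, PROT_READ, MAP_SHARED, fd, 0);
 *	remap_file_pages(p, 4096, 0, 3, 0);	// page 3 of fd at p
 *
 * The file offset is then encoded in the pte itself, which is what
 * pte_to_pgoff() recovers above.
 */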
3522
3523int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3524 unsigned long addr, int page_nid)
3525{
3526 get_page(page);
3527
3528 count_vm_numa_event(NUMA_HINT_FAULTS);
3529 if (page_nid == numa_node_id())
3530 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3531
3532 return mpol_misplaced(page, vma, addr);
3533}
3534
3535int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3536 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3537{
3538 struct page *page = NULL;
3539 spinlock_t *ptl;
3540 int page_nid = -1;
3541 int target_nid;
3542 bool migrated = false;
3543
3544 /*
3545 * The "pte" at this point cannot be used safely without
3546 * validation through pte_unmap_same(). It's of NUMA type but
3547 * the pfn may be screwed if the read is non atomic.
3548 *
3549 * ptep_modify_prot_start is not called as this is clearing
3550 * the _PAGE_NUMA bit and it is not really expected that there
3551 * would be concurrent hardware modifications to the PTE.
3552 */
3553 ptl = pte_lockptr(mm, pmd);
3554 spin_lock(ptl);
3555 if (unlikely(!pte_same(*ptep, pte))) {
3556 pte_unmap_unlock(ptep, ptl);
3557 goto out;
3558 }
3559
3560 pte = pte_mknonnuma(pte);
3561 set_pte_at(mm, addr, ptep, pte);
3562 update_mmu_cache(vma, addr, ptep);
3563
3564 page = vm_normal_page(vma, addr, pte);
3565 if (!page) {
3566 pte_unmap_unlock(ptep, ptl);
3567 return 0;
3568 }
3569
3570 page_nid = page_to_nid(page);
3571 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3572 pte_unmap_unlock(ptep, ptl);
3573 if (target_nid == -1) {
3574 put_page(page);
3575 goto out;
3576 }
3577
3578 /* Migrate to the requested node */
3579 migrated = migrate_misplaced_page(page, target_nid);
3580 if (migrated)
3581 page_nid = target_nid;
3582
3583out:
3584 if (page_nid != -1)
3585 task_numa_fault(page_nid, 1, migrated);
3586 return 0;
3587}
3588
3589/* NUMA hinting page fault entry point for regular pmds */
3590#ifdef CONFIG_NUMA_BALANCING
3591static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3592 unsigned long addr, pmd_t *pmdp)
3593{
3594 pmd_t pmd;
3595 pte_t *pte, *orig_pte;
3596 unsigned long _addr = addr & PMD_MASK;
3597 unsigned long offset;
3598 spinlock_t *ptl;
3599 bool numa = false;
3600
3601 spin_lock(&mm->page_table_lock);
3602 pmd = *pmdp;
3603 if (pmd_numa(pmd)) {
3604 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3605 numa = true;
3606 }
3607 spin_unlock(&mm->page_table_lock);
3608
3609 if (!numa)
3610 return 0;
3611
3612 /* we're in a page fault so some vma must be in the range */
3613 BUG_ON(!vma);
3614 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3615 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3616 VM_BUG_ON(offset >= PMD_SIZE);
3617 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3618 pte += offset >> PAGE_SHIFT;
3619 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3620 pte_t pteval = *pte;
3621 struct page *page;
3622 int page_nid = -1;
3623 int target_nid;
3624 bool migrated = false;
3625
3626 if (!pte_present(pteval))
3627 continue;
3628 if (!pte_numa(pteval))
3629 continue;
3630 if (addr >= vma->vm_end) {
3631 vma = find_vma(mm, addr);
3632 /* there's a pte present so there must be a vma */
3633 BUG_ON(!vma);
3634 BUG_ON(addr < vma->vm_start);
3635 }
3636 if (pte_numa(pteval)) {
3637 pteval = pte_mknonnuma(pteval);
3638 set_pte_at(mm, addr, pte, pteval);
3639 }
3640 page = vm_normal_page(vma, addr, pteval);
3641 if (unlikely(!page))
3642 continue;
3643 /* only check non-shared pages */
3644 if (unlikely(page_mapcount(page) != 1))
3645 continue;
3646
3647 page_nid = page_to_nid(page);
3648 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3649 pte_unmap_unlock(pte, ptl);
3650 if (target_nid != -1) {
3651 migrated = migrate_misplaced_page(page, target_nid);
3652 if (migrated)
3653 page_nid = target_nid;
3654 } else {
3655 put_page(page);
3656 }
3657
3658 if (page_nid != -1)
3659 task_numa_fault(page_nid, 1, migrated);
3660
3661 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3662 }
3663 pte_unmap_unlock(orig_pte, ptl);
3664
3665 return 0;
3666}
3667#else
3668static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3669 unsigned long addr, pmd_t *pmdp)
3670{
3671 BUG();
3672 return 0;
3673}
3674#endif
3675
3676/*
3677 * These routines also need to handle stuff like marking pages dirty
3678 * and/or accessed for architectures that don't do it in hardware (most
3679 * RISC architectures). The early dirtying is also good on the i386.
3680 *
3681 * There is also a hook called "update_mmu_cache()" that architectures
3682 * with external mmu caches can use to update those (ie the Sparc or
3683 * PowerPC hashed page tables that act as extended TLBs).
3684 *
3685 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3686 * but allow concurrent faults), and pte mapped but not yet locked.
3687 * We return with mmap_sem still held, but pte unmapped and unlocked.
3688 */
3689static int handle_pte_fault(struct mm_struct *mm,
3690 struct vm_area_struct *vma, unsigned long address,
3691 pte_t *pte, pmd_t *pmd, unsigned int flags)
3692{
3693 pte_t entry;
3694 spinlock_t *ptl;
3695
3696 entry = *pte;
3697 if (!pte_present(entry)) {
3698 if (pte_none(entry)) {
3699 if (vma->vm_ops) {
3700 if (likely(vma->vm_ops->fault))
3701 return do_linear_fault(mm, vma, address,
3702 pte, pmd, flags, entry);
3703 }
3704 return do_anonymous_page(mm, vma, address,
3705 pte, pmd, flags);
3706 }
3707 if (pte_file(entry))
3708 return do_nonlinear_fault(mm, vma, address,
3709 pte, pmd, flags, entry);
3710 return do_swap_page(mm, vma, address,
3711 pte, pmd, flags, entry);
3712 }
3713
3714 if (pte_numa(entry))
3715 return do_numa_page(mm, vma, address, entry, pte, pmd);
3716
3717 ptl = pte_lockptr(mm, pmd);
3718 spin_lock(ptl);
3719 if (unlikely(!pte_same(*pte, entry)))
3720 goto unlock;
3721 if (flags & FAULT_FLAG_WRITE) {
3722 if (!pte_write(entry))
3723 return do_wp_page(mm, vma, address,
3724 pte, pmd, ptl, entry);
3725 entry = pte_mkdirty(entry);
3726 }
3727 entry = pte_mkyoung(entry);
3728 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3729 update_mmu_cache(vma, address, pte);
3730 } else {
3731 /*
3732 * This is needed only for protection faults but the arch code
3733 * is not yet telling us if this is a protection fault or not.
3734 * This still avoids useless tlb flushes for .text page faults
3735 * with threads.
3736 */
3737 if (flags & FAULT_FLAG_WRITE)
3738 flush_tlb_fix_spurious_fault(vma, address);
3739 }
3740unlock:
3741 pte_unmap_unlock(pte, ptl);
3742 return 0;
3743}
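/*
 * Editor's summary of the dispatch above:
 *
 *	pte state				handler
 *	-----------------------------------------------------------
 *	not present, none, vma has ->fault	do_linear_fault()
 *	not present, none, anonymous		do_anonymous_page()
 *	not present, file pte			do_nonlinear_fault()
 *	not present, swap entry			do_swap_page()
 *	present, NUMA hinting			do_numa_page()
 *	present, write fault, !pte_write	do_wp_page()
 *	present, otherwise			young/dirty bit fixup
 */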
3744
3745/*
3746 * By the time we get here, we already hold the mm semaphore.
3747 */
3748static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3749 unsigned long address, unsigned int flags)
3750{
3751 pgd_t *pgd;
3752 pud_t *pud;
3753 pmd_t *pmd;
3754 pte_t *pte;
3755
3756 if (unlikely(is_vm_hugetlb_page(vma)))
3757 return hugetlb_fault(mm, vma, address, flags);
3758
3759retry:
3760 pgd = pgd_offset(mm, address);
3761 pud = pud_alloc(mm, pgd, address);
3762 if (!pud)
3763 return VM_FAULT_OOM;
3764 pmd = pmd_alloc(mm, pud, address);
3765 if (!pmd)
3766 return VM_FAULT_OOM;
3767 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3768 int ret = VM_FAULT_FALLBACK;
3769 if (!vma->vm_ops)
3770 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3771 pmd, flags);
3772 if (!(ret & VM_FAULT_FALLBACK))
3773 return ret;
3774 } else {
3775 pmd_t orig_pmd = *pmd;
3776 int ret;
3777
3778 barrier();
3779 if (pmd_trans_huge(orig_pmd)) {
3780 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3781
3782 /*
3783 * If the pmd is splitting, return and retry the
3784 * fault. Alternative: wait until the split is
3785 * done, and goto retry.
3786 */
3787 if (pmd_trans_splitting(orig_pmd))
3788 return 0;
3789
3790 if (pmd_numa(orig_pmd))
3791 return do_huge_pmd_numa_page(mm, vma, address,
3792 orig_pmd, pmd);
3793
3794 if (dirty && !pmd_write(orig_pmd)) {
3795 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3796 orig_pmd);
3797 /*
3798 * If COW results in an oom, the huge pmd will
3799 * have been split, so retry the fault on the
3800 * pte for a smaller charge.
3801 */
3802 if (unlikely(ret & VM_FAULT_OOM))
3803 goto retry;
3804 return ret;
3805 } else {
3806 huge_pmd_set_accessed(mm, vma, address, pmd,
3807 orig_pmd, dirty);
3808 }
3809
3810 return 0;
3811 }
3812 }
3813
3814 if (pmd_numa(*pmd))
3815 return do_pmd_numa_page(mm, vma, address, pmd);
3816
3817 /*
3818 * Use __pte_alloc instead of pte_alloc_map, because we can't
3819 * run pte_offset_map on the pmd, if a huge pmd could
3820 * materialize from under us from a different thread.
3821 */
3822 if (unlikely(pmd_none(*pmd)) &&
3823 unlikely(__pte_alloc(mm, vma, pmd, address)))
3824 return VM_FAULT_OOM;
3825 /* if a huge pmd materialized from under us just retry later */
3826 if (unlikely(pmd_trans_huge(*pmd)))
3827 return 0;
3828
3829 /*
3830 * A regular pmd is established and can't morph into a huge pmd from
3831 * under us anymore, because we hold the mmap_sem in read mode and
3832 * khugepaged takes it in write mode; pte_offset_map() is now safe.
3833 */
3834 pte = pte_offset_map(pmd, address);
3835
3836 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3837}
3838
3839int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3840 unsigned long address, unsigned int flags)
3841{
3842 int ret;
3843
3844 __set_current_state(TASK_RUNNING);
3845
3846 count_vm_event(PGFAULT);
3847 mem_cgroup_count_vm_event(mm, PGFAULT);
3848
3849 /* do counter updates before entering really critical section. */
3850 check_sync_rss_stat(current);
3851
3852 /*
3853 * Enable the memcg OOM handling for faults triggered in user mode,
3854 * so that a memcg OOM can be resolved on the way out of the fault.
3855 */
3856 if (flags & FAULT_FLAG_USER)
3857 mem_cgroup_oom_enable();
3858
3859 ret = __handle_mm_fault(mm, vma, address, flags);
3860
3861 if (flags & FAULT_FLAG_USER) {
3862 mem_cgroup_oom_disable();
3863 /*
3864 * The task may have entered a memcg OOM situation but
3865 * if the allocation error was handled gracefully (no
3866 * VM_FAULT_OOM), there is no need to kill anything.
3867 * Just clean up the OOM state peacefully.
3868 */
3869 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3870 mem_cgroup_oom_synchronize(false);
3871 }
3872
3873 return ret;
3874}
3875
3876#ifndef __PAGETABLE_PUD_FOLDED
3877/*
3878 * Allocate page upper directory.
3879 * We've already handled the fast-path in-line.
3880 */
3881int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3882{
3883 pud_t *new = pud_alloc_one(mm, address);
3884 if (!new)
3885 return -ENOMEM;
3886
3887 smp_wmb(); /* See comment in __pte_alloc */
3888
3889 spin_lock(&mm->page_table_lock);
3890 if (pgd_present(*pgd))
3891 pud_free(mm, new);
3892 else
3893 pgd_populate(mm, pgd, new);
3894 spin_unlock(&mm->page_table_lock);
3895 return 0;
3896}
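/*
 * Editor's note on the publish pattern above: pud_alloc_one()
 * initializes the new table, smp_wmb() orders those stores before
 * pgd_populate() makes the table visible, and the pgd_present()
 * recheck under page_table_lock resolves races -- the losing thread
 * simply frees its freshly allocated table. __pmd_alloc() below uses
 * the identical scheme one level down.
 */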
3897#endif
3898
3899#ifndef __PAGETABLE_PMD_FOLDED
3900/*
3901 * Allocate page middle directory.
3902 * We've already handled the fast-path in-line.
3903 */
3904int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3905{
3906 pmd_t *new = pmd_alloc_one(mm, address);
3907 if (!new)
3908 return -ENOMEM;
3909
3910 smp_wmb(); /* See comment in __pte_alloc */
3911
3912 spin_lock(&mm->page_table_lock);
3913#ifndef __ARCH_HAS_4LEVEL_HACK
3914 if (pud_present(*pud))
3915 pmd_free(mm, new);
3916 else
3917 pud_populate(mm, pud, new);
3918#else
3919 if (pgd_present(*pud))
3920 pmd_free(mm, new);
3921 else
3922 pgd_populate(mm, pud, new);
3923#endif
3924 spin_unlock(&mm->page_table_lock);
3925 return 0;
3926}
3927#endif
3928
3929#if !defined(__HAVE_ARCH_GATE_AREA)
3930
3931#if defined(AT_SYSINFO_EHDR)
3932static struct vm_area_struct gate_vma;
3933
3934static int __init gate_vma_init(void)
3935{
3936 gate_vma.vm_mm = NULL;
3937 gate_vma.vm_start = FIXADDR_USER_START;
3938 gate_vma.vm_end = FIXADDR_USER_END;
3939 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3940 gate_vma.vm_page_prot = __P101;
3941
3942 return 0;
3943}
3944__initcall(gate_vma_init);
3945#endif
3946
3947struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3948{
3949#ifdef AT_SYSINFO_EHDR
3950 return &gate_vma;
3951#else
3952 return NULL;
3953#endif
3954}
3955
3956int in_gate_area_no_mm(unsigned long addr)
3957{
3958#ifdef AT_SYSINFO_EHDR
3959 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3960 return 1;
3961#endif
3962 return 0;
3963}
3964
3965#endif
3966
3967static int __follow_pte(struct mm_struct *mm, unsigned long address,
3968 pte_t **ptepp, spinlock_t **ptlp)
3969{
3970 pgd_t *pgd;
3971 pud_t *pud;
3972 pmd_t *pmd;
3973 pte_t *ptep;
3974
3975 pgd = pgd_offset(mm, address);
3976 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3977 goto out;
3978
3979 pud = pud_offset(pgd, address);
3980 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3981 goto out;
3982
3983 pmd = pmd_offset(pud, address);
3984 VM_BUG_ON(pmd_trans_huge(*pmd));
3985 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3986 goto out;
3987
3988 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3989 if (pmd_huge(*pmd))
3990 goto out;
3991
3992 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3993 if (!ptep)
3994 goto out;
3995 if (!pte_present(*ptep))
3996 goto unlock;
3997 *ptepp = ptep;
3998 return 0;
3999unlock:
4000 pte_unmap_unlock(ptep, *ptlp);
4001out:
4002 return -EINVAL;
4003}
4004
4005static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4006 pte_t **ptepp, spinlock_t **ptlp)
4007{
4008 int res;
4009
4010 /* (void) is needed to make gcc happy */
4011 (void) __cond_lock(*ptlp,
4012 !(res = __follow_pte(mm, address, ptepp, ptlp)));
4013 return res;
4014}
4015
4016/**
4017 * follow_pfn - look up PFN at a user virtual address
4018 * @vma: memory mapping
4019 * @address: user virtual address
4020 * @pfn: location to store found PFN
4021 *
4022 * Only IO mappings and raw PFN mappings are allowed.
4023 *
4024 * Returns zero and the pfn at @pfn on success, -ve otherwise.
4025 */
4026int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4027 unsigned long *pfn)
4028{
4029 int ret = -EINVAL;
4030 spinlock_t *ptl;
4031 pte_t *ptep;
4032
4033 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4034 return ret;
4035
4036 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4037 if (ret)
4038 return ret;
4039 *pfn = pte_pfn(*ptep);
4040 pte_unmap_unlock(ptep, ptl);
4041 return 0;
4042}
4043EXPORT_SYMBOL(follow_pfn);
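/*
 * Editor's sketch of a caller (hypothetical, not from this file):
 *
 *	unsigned long pfn;
 *
 *	if (follow_pfn(vma, addr, &pfn))
 *		return -EFAULT;
 *	paddr = (resource_size_t)pfn << PAGE_SHIFT;
 *
 * The pte lock is dropped before returning, so the result is only a
 * snapshot unless the caller pins the mapping by other means.
 */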
4044
4045#ifdef CONFIG_HAVE_IOREMAP_PROT
4046int follow_phys(struct vm_area_struct *vma,
4047 unsigned long address, unsigned int flags,
4048 unsigned long *prot, resource_size_t *phys)
4049{
4050 int ret = -EINVAL;
4051 pte_t *ptep, pte;
4052 spinlock_t *ptl;
4053
4054 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4055 goto out;
4056
4057 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4058 goto out;
4059 pte = *ptep;
4060
4061 if ((flags & FOLL_WRITE) && !pte_write(pte))
4062 goto unlock;
4063
4064 *prot = pgprot_val(pte_pgprot(pte));
4065 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4066
4067 ret = 0;
4068unlock:
4069 pte_unmap_unlock(ptep, ptl);
4070out:
4071 return ret;
4072}
4073
4074int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4075 void *buf, int len, int write)
4076{
4077 resource_size_t phys_addr;
4078 unsigned long prot = 0;
4079 void __iomem *maddr;
4080 int offset = addr & (PAGE_SIZE-1);
4081
4082 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4083 return -EINVAL;
4084
4085 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
4086 if (write)
4087 memcpy_toio(maddr + offset, buf, len);
4088 else
4089 memcpy_fromio(buf, maddr + offset, len);
4090 iounmap(maddr);
4091
4092 return len;
4093}
4094EXPORT_SYMBOL_GPL(generic_access_phys);
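/*
 * Editor's sketch (assumed usage): a driver mapping MMIO with
 * remap_pfn_range() can make that memory reachable by ptrace and
 * /proc/<pid>/mem by wiring this in as its ->access method:
 *
 *	static const struct vm_operations_struct example_mmio_vm_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 * __access_remote_vm() below falls back to vm_ops->access() exactly
 * when get_user_pages() cannot pin a page, as is the case for
 * VM_IO | VM_PFNMAP areas.
 */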
4095#endif
4096
4097/*
4098 * Access another process' address space as given in mm. If non-NULL,
4099 * use the given task for page fault accounting.
4100 */
4101static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4102 unsigned long addr, void *buf, int len, int write)
4103{
4104 struct vm_area_struct *vma;
4105 void *old_buf = buf;
4106
4107 down_read(&mm->mmap_sem);
4108
4109 while (len) {
4110 int bytes, ret, offset;
4111 void *maddr;
4112 struct page *page = NULL;
4113
4114 ret = get_user_pages(tsk, mm, addr, 1,
4115 write, 1, &page, &vma);
4116 if (ret <= 0) {
4117 /*
4118 * Check if this is a VM_IO | VM_PFNMAP VMA, which
4119 * we can access using slightly different code.
4120 */
4121#ifdef CONFIG_HAVE_IOREMAP_PROT
4122 vma = find_vma(mm, addr);
4123 if (!vma || vma->vm_start > addr)
4124 break;
4125 if (vma->vm_ops && vma->vm_ops->access)
4126 ret = vma->vm_ops->access(vma, addr, buf,
4127 len, write);
4128 if (ret <= 0)
4129#endif
4130 break;
4131 bytes = ret;
4132 } else {
4133 bytes = len;
4134 offset = addr & (PAGE_SIZE-1);
4135 if (bytes > PAGE_SIZE-offset)
4136 bytes = PAGE_SIZE-offset;
4137
4138 maddr = kmap(page);
4139 if (write) {
4140 copy_to_user_page(vma, page, addr,
4141 maddr + offset, buf, bytes);
4142 set_page_dirty_lock(page);
4143 } else {
4144 copy_from_user_page(vma, page, addr,
4145 buf, maddr + offset, bytes);
4146 }
4147 kunmap(page);
4148 page_cache_release(page);
4149 }
4150 len -= bytes;
4151 buf += bytes;
4152 addr += bytes;
4153 }
4154 up_read(&mm->mmap_sem);
4155
4156 return buf - old_buf;
4157}
4158
4159/**
4160 * access_remote_vm - access another process' address space
4161 * @mm: the mm_struct of the target address space
4162 * @addr: start address to access
4163 * @buf: source or destination buffer
4164 * @len: number of bytes to transfer
4165 * @write: whether the access is a write
4166 *
4167 * The caller must hold a reference on @mm.
4168 */
4169int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4170 void *buf, int len, int write)
4171{
4172 return __access_remote_vm(NULL, mm, addr, buf, len, write);
4173}
4174
4175/*
4176 * Access another process' address space.
4177 * Source/target buffer must be kernel space.
4178 * Do not walk the page table directly; use get_user_pages.
4179 */
4180int access_process_vm(struct task_struct *tsk, unsigned long addr,
4181 void *buf, int len, int write)
4182{
4183 struct mm_struct *mm;
4184 int ret;
4185
4186 mm = get_task_mm(tsk);
4187 if (!mm)
4188 return 0;
4189
4190 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
4191 mmput(mm);
4192
4193 return ret;
4194}
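/*
 * Editor's sketch of a caller (illustration): this is the mechanism
 * behind ptrace word peeks, roughly:
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word), 0)
 *			!= sizeof(word))
 *		return -EIO;
 *	return put_user(word, datap);
 */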
4195
4196/*
4197 * Print the name of a VMA.
4198 */
4199void print_vma_addr(char *prefix, unsigned long ip)
4200{
4201 struct mm_struct *mm = current->mm;
4202 struct vm_area_struct *vma;
4203
4204 /*
4205 * Do not print if we are in atomic
4206 * contexts (in exception stacks, etc.):
4207 */
4208 if (preempt_count())
4209 return;
4210
4211 down_read(&mm->mmap_sem);
4212 vma = find_vma(mm, ip);
4213 if (vma && vma->vm_file) {
4214 struct file *f = vma->vm_file;
4215 char *buf = (char *)__get_free_page(GFP_KERNEL);
4216 if (buf) {
4217 char *p;
4218
4219 p = d_path(&f->f_path, buf, PAGE_SIZE);
4220 if (IS_ERR(p))
4221 p = "?";
4222 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4223 vma->vm_start,
4224 vma->vm_end - vma->vm_start);
4225 free_page((unsigned long)buf);
4226 }
4227 }
4228 up_read(&mm->mmap_sem);
4229}
4230
4231#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4232void might_fault(void)
4233{
4234 /*
4235 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
4236 * holding the mmap_sem, this is safe because kernel memory doesn't
4237 * get paged out, therefore we'll never actually fault, and the
4238 * below annotations will generate false positives.
4239 */
4240 if (segment_eq(get_fs(), KERNEL_DS))
4241 return;
4242
4243 /*
4244 * it would be nicer only to annotate paths which are not under
4245 * pagefault_disable, however that requires a larger audit and
4246 * providing helpers like get_user_atomic.
4247 */
4248 if (in_atomic())
4249 return;
4250
4251 __might_sleep(__FILE__, __LINE__, 0);
4252
4253 if (current->mm)
4254 might_lock_read(&current->mm->mmap_sem);
4255}
4256EXPORT_SYMBOL(might_fault);
4257#endif
4258
4259#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4260static void clear_gigantic_page(struct page *page,
4261 unsigned long addr,
4262 unsigned int pages_per_huge_page)
4263{
4264 int i;
4265 struct page *p = page;
4266
4267 might_sleep();
4268 for (i = 0; i < pages_per_huge_page;
4269 i++, p = mem_map_next(p, page, i)) {
4270 cond_resched();
4271 clear_user_highpage(p, addr + i * PAGE_SIZE);
4272 }
4273}
4274void clear_huge_page(struct page *page,
4275 unsigned long addr, unsigned int pages_per_huge_page)
4276{
4277 int i;
4278
4279 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4280 clear_gigantic_page(page, addr, pages_per_huge_page);
4281 return;
4282 }
4283
4284 might_sleep();
4285 for (i = 0; i < pages_per_huge_page; i++) {
4286 cond_resched();
4287 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4288 }
4289}
4290
4291static void copy_user_gigantic_page(struct page *dst, struct page *src,
4292 unsigned long addr,
4293 struct vm_area_struct *vma,
4294 unsigned int pages_per_huge_page)
4295{
4296 int i;
4297 struct page *dst_base = dst;
4298 struct page *src_base = src;
4299
4300 for (i = 0; i < pages_per_huge_page; ) {
4301 cond_resched();
4302 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4303
4304 i++;
4305 dst = mem_map_next(dst, dst_base, i);
4306 src = mem_map_next(src, src_base, i);
4307 }
4308}
4309
4310void copy_user_huge_page(struct page *dst, struct page *src,
4311 unsigned long addr, struct vm_area_struct *vma,
4312 unsigned int pages_per_huge_page)
4313{
4314 int i;
4315
4316 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4317 copy_user_gigantic_page(dst, src, addr, vma,
4318 pages_per_huge_page);
4319 return;
4320 }
4321
4322 might_sleep();
4323 for (i = 0; i < pages_per_huge_page; i++) {
4324 cond_resched();
4325 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4326 }
4327}
4328#endif
4329