/*
 *  linux/mm/memory.c
 *
 *  Demand paging, copy-on-write, page-table setup and teardown,
 *  and page fault handling for the memory manager.
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the
 * assumption that high_memory defines the upper bound of the kernel's
 * direct-mapped virtual address range.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

EXPORT_SYMBOL(zero_pfn);

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);


#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return 0;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm, covering the range @start to @end.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif

	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}
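
/*
 * Sketch of the typical calling sequence for the mmu_gather API above
 * (illustrative only; unmap_vmas() and free_pgtables() are the real
 * callers used on the munmap/exit paths elsewhere in mm/):
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	unmap_vmas(&tlb, vma, start, end);	// clear ptes, batch pages
 *	free_pgtables(&tlb, vma, floor, ceiling);
 *	tlb_finish_mmu(&tlb, start, end);	// flush TLB, free batches
 */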

/* __tlb_remove_page
 *	Queue @page for freeing once the TLB has been flushed.  Returns the
 *	number of slots still free in the active batch, or 0 if no further
 *	pages can be queued (the caller must then flush via tlb_flush_mmu()).
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there's less then two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before_spin_lock), but not yet */

	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * and if that is not true, we'll have a normal page.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]++;
			else
				rss[MM_FILEPAGES]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there's a chance the pages we are copying might be COWed by
	 * the parent process, i.e. only for is_cow_mapping() vmas;
	 * otherwise the notifier overhead can be skipped.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(!__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else
				rss[MM_FILEPAGES]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);

		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
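
/*
 * Usage sketch for zap_vma_ptes() (hypothetical driver code, not part of
 * this file): a driver that earlier set up a VM_PFNMAP mapping can revoke
 * it, so that subsequent user accesses fault instead of reaching the
 * now-stale physical pages:
 *
 *	// vma was obtained from the driver's ->mmap() and spans one page
 *	if (zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE))
 *		pr_warn("range not fully inside a VM_PFNMAP vma\n");
 */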

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t * pgd = pgd_offset(mm, addr);
	pud_t * pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t * pmd = pmd_alloc(mm, pud, addr);
		if (pmd) {
			VM_BUG_ON(pmd_trans_huge(*pmd));
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
		}
	}
	return NULL;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, MM_FILEPAGES);
	page_add_file_rmap(page);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
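
/*
 * Minimal usage sketch (hypothetical driver code, not part of this file):
 * insert one kernel-allocated page into userspace from an ->mmap() handler.
 * The driver keeps its own reference to the page (vm_insert_page takes an
 * additional one) and remains responsible for eventually freeing it.
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 *
 *		if (!page)
 *			return -ENOMEM;
 *		// stash 'page' in driver state so it can be freed later
 *		return vm_insert_page(vma, vma->vm_start, page);
 *	}
 */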

static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	entry = pte_mkspecial(pfn_pte(pfn, prot));
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	int ret;
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return -EINVAL;

	ret = insert_pfn(vma, addr, pfn, pgprot);

	return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);

int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).
	 */
	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
		struct page *page;

		page = pfn_to_page(pfn);
		return insert_page(vma, addr, page, vma->vm_page_prot);
	}
	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_mixed);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO is turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_pud_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
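
/*
 * Minimal usage sketch (hypothetical driver code, not part of this file):
 * map a physically contiguous buffer to userspace from ->mmap().  Here
 * my_buf_phys is assumed to be the buffer's physical base address.
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       my_buf_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */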

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
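
/*
 * Usage sketch (hypothetical driver code): vm_iomap_memory() lets an
 * ->mmap() handler pass the raw physical range and have the offset and
 * size checks done here instead of open-coding them:
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		// phys_base/region_len describe the device aperture (assumed)
 *		return vm_iomap_memory(vma, phys_base, region_len);
 *	}
 */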

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
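
/*
 * Usage sketch (hypothetical, not part of this file): the pte_fn_t
 * callback is invoked once per pte in the range, with the page tables
 * already allocated and locked by apply_to_pte_range() above.  A trivial
 * callback that just counts the ptes visited:
 *
 *	static int count_pte(pte_t *pte, pgtable_t token,
 *			     unsigned long addr, void *data)
 *	{
 *		(*(unsigned long *)data)++;
 *		return 0;	// non-zero would abort the walk
 *	}
 *
 *	unsigned long n = 0;
 *	apply_to_page_range(mm, addr, size, count_pte, &n);
 */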

/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically.  Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page must check under
 * lock before unmapping the pte and proceeding (but do_wp_page is only
 * called after already making such a check; and do_anonymous_page can
 * safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
	       unsigned long address)
{
	struct vm_fault vmf;
	int ret;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = page->index;
	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
	vmf.page = page;
	vmf.cow_page = NULL;

	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline int wp_page_reuse(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			struct page *page, int page_mkwrite,
			int dirty_shared)
	__releases(ptl)
{
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, address, pte_pfn(orig_pte));
	entry = pte_mkyoung(orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
		update_mmu_cache(vma, address, page_table);
	pte_unmap_unlock(page_table, ptl);

	if (dirty_shared) {
		struct address_space *mapping;
		int dirtied;

		if (!page_mkwrite)
			lock_page(page);

		dirtied = set_page_dirty(page);
		VM_BUG_ON_PAGE(PageAnon(page), page);
		mapping = page->mapping;
		unlock_page(page);
		page_cache_release(page);

		if ((dirtied || page_mkwrite) && mapping) {
			/*
			 * Some device drivers do not set page.mapping
			 * but still dirty their pages
			 */
			balance_dirty_pages_ratelimited(mapping);
		}

		if (!page_mkwrite)
			file_update_time(vma->vm_file);
	}

	return VM_FAULT_WRITE;
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *page_table, pmd_t *pmd,
			pte_t orig_pte, struct page *old_page)
{
	struct page *new_page = NULL;
	spinlock_t *ptl = NULL;
	pte_t entry;
	int page_copied = 0;
	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
	struct mem_cgroup *memcg;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma, address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);
	}
	__SetPageUptodate(new_page);

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
		goto oom_free_new;

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm, MM_FILEPAGES);
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		mem_cgroup_commit_charge(new_page, memcg, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, page_table);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg);
	}

	if (new_page)
		page_cache_release(new_page);

	pte_unmap_unlock(page_table, ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);
			munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		page_cache_release(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page)
		page_cache_release(old_page);
	return VM_FAULT_OOM;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static int wp_pfn_shared(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			pmd_t *pmd)
{
	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		struct vm_fault vmf = {
			.page = NULL,
			.pgoff = linear_page_index(vma, address),
			.virtual_address = (void __user *)(address & PAGE_MASK),
			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
		};
		int ret;

		pte_unmap_unlock(page_table, ptl);
		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
		if (ret & VM_FAULT_ERROR)
			return ret;
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		/*
		 * We might have raced with another page fault while we
		 * released the pte_offset_map_lock.
		 */
		if (!pte_same(*page_table, orig_pte)) {
			pte_unmap_unlock(page_table, ptl);
			return 0;
		}
	}
	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
			     NULL, 0, 0);
}

static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
			  unsigned long address, pte_t *page_table,
			  pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
			  struct page *old_page)
	__releases(ptl)
{
	int page_mkwrite = 0;

	page_cache_get(old_page);

	/*
	 * Only catch write-faults on shared writable pages,
	 * read-only shared pages can get COWed by
	 * get_user_pages(.write=1, .force=1).
	 */
	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		int tmp;

		pte_unmap_unlock(page_table, ptl);
		tmp = do_page_mkwrite(vma, old_page, address);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			page_cache_release(old_page);
			return tmp;
		}
		/*
		 * Since we dropped the lock we need to revalidate
		 * the PTE as someone else may have changed it.  If
		 * they did, we just return, as we can count on the
		 * MMU to tell us if they didn't also make it writable.
		 */
		page_table = pte_offset_map_lock(mm, pmd, address,
						 &ptl);
		if (!pte_same(*page_table, orig_pte)) {
			unlock_page(old_page);
			pte_unmap_unlock(page_table, ptl);
			page_cache_release(old_page);
			return 0;
		}
		page_mkwrite = 1;
	}

	return wp_page_reuse(mm, vma, address, page_table, ptl,
			     orig_pte, old_page, page_mkwrite, 1);
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
{
	struct page *old_page;

	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(mm, vma, address, page_table, ptl,
					     orig_pte, pmd);

		pte_unmap_unlock(page_table, ptl);
		return wp_page_copy(mm, vma, address, page_table, pmd,
				    orig_pte, old_page);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				pte_unmap_unlock(page_table, ptl);
				page_cache_release(old_page);
				return 0;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {
			/*
			 * The page is all ours. Move it to our anon_vma so
			 * the rmap code will not search our parent or siblings.
			 * Protected against the rmap code by the page lock.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			return wp_page_reuse(mm, vma, address, page_table, ptl,
					     orig_pte, old_page, 0, 0);
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(mm, vma, address, page_table, pmd,
				      ptl, orig_pte, old_page);
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	page_cache_get(old_page);

	pte_unmap_unlock(page_table, ptl);
	return wp_page_copy(mm, vma, address, page_table, pmd,
			    orig_pte, old_page);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		/* Clamp the zap range to the vma's page range */
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	struct zap_details details;
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	details.check_mapping = even_cows? NULL: mapping;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;


	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
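
/*
 * Usage sketch (illustrative): a filesystem shrinking an inode to new_size
 * unmaps every mapping of the dead range, including private COWed copies,
 * by passing even_cows == 1, as truncate_pagecache() does:
 *
 *	loff_t holebegin = round_up(new_size, PAGE_SIZE);
 *
 *	unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
 */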

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	spinlock_t *ptl;
	struct page *page, *swapcache;
	struct mem_cgroup *memcg;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	int ret = 0;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		goto out;

	entry = pte_to_swp_entry(orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(mm, pmd, address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, address, orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
			if (likely(pte_same(*page_table, orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * own processes. Try to unmap, but leave the page in
		 * the swap cache.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		swapcache = page;
		goto out_release;
	}

	swapcache = page;
	locked = lock_page_or_retry(page, mm, flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*page_table, orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */
	inc_mm_counter_fast(mm, MM_ANONPAGES);
	dec_mm_counter_fast(mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = 1;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);
	set_pte_at(mm, address, page_table, pte);
	if (page == swapcache) {
		do_page_add_anon_rmap(page, vma, address, exclusive);
		mem_cgroup_commit_charge(page, memcg, true);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, address);
		mem_cgroup_commit_charge(page, memcg, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}

	swap_free(entry);
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}

	if (flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge(page, memcg);
	pte_unmap_unlock(page_table, ptl);
out_page:
	unlock_page(page);
out_release:
	page_cache_release(page);
	if (page != swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
	return ret;
}

/*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */
static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
{
	address &= PAGE_MASK;
	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
		struct vm_area_struct *prev = vma->vm_prev;

		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That's only ok if it's the same stack mapping
		 * that has gotten split..
		 */
		if (prev && prev->vm_end == address)
			return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;

		return expand_downwards(vma, address - PAGE_SIZE);
	}
	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
		struct vm_area_struct *next = vma->vm_next;

		/* As VM_GROWSDOWN but s/below/above/ */
		if (next && next->vm_start == address + PAGE_SIZE)
			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;

		return expand_upwards(vma, address + PAGE_SIZE);
	}
	return 0;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct mem_cgroup *memcg;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	pte_unmap(page_table);

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGSEGV;

	/* Use the zero-page for reads */
	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (!pte_none(*page_table))
			goto unlock;
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
		goto oom_free_page;

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte_none(*page_table))
		goto release;

	inc_mm_counter_fast(mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address);
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, vma);
setpte:
	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, page_table);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	mem_cgroup_cancel_charge(page, memcg);
	page_cache_release(page);
	goto unlock;
oom_free_page:
	page_cache_release(page);
oom:
	return VM_FAULT_OOM;
}
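
/*
 * Illustrative userspace sketch (kept out of the build): the two halves of
 * do_anonymous_page() are easy to observe from userspace. Read faults on a
 * fresh anonymous mapping are satisfied by the shared zero page, so the
 * resident set barely moves; the first write takes the allocation path and
 * pins a real page. Reading RSS from /proc/self/statm here is just one way
 * to watch that happen.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

static long resident_pages(void)
{
	long size, resident;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f || fscanf(f, "%ld %ld", &size, &resident) != 2)
		exit(1);
	fclose(f);
	return resident;
}

int main(void)
{
	size_t len = 64 << 20;	/* 64MB of anonymous memory */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	size_t i;
	char sum = 0;

	if (p == MAP_FAILED)
		return 1;

	/* Read faults map the zero page: RSS stays nearly flat. */
	for (i = 0; i < len; i += 4096)
		sum += p[i];
	printf("after reads:  %ld resident pages\n", resident_pages());

	/* Write faults allocate real pages: RSS grows by ~16k pages. */
	for (i = 0; i < len; i += 4096)
		p[i] = 1;
	printf("after writes: %ld resident pages\n", resident_pages());
	(void)sum;
	return 0;
}
#endif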

/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
			pgoff_t pgoff, unsigned int flags,
			struct page *cow_page, struct page **page)
{
	struct vm_fault vmf;
	int ret;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;
	vmf.cow_page = cow_page;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;
	if (!vmf.page)
		goto out;

	if (unlikely(PageHWPoison(vmf.page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf.page);
		page_cache_release(vmf.page);
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);

 out:
	*page = vmf.page;
	return ret;
}

/**
 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
 *
 * @vma: virtual memory area
 * @address: user virtual address
 * @page: page to map
 * @pte: pointer to target page table entry
 * @write: true, if new entry is writable
 * @anon: true, if it's anonymous page
 *
 * Caller must hold page table lock relevant for @pte.
 *
 * Target users are page handler itself and implementations of
 * vma->vm_ops->map_pages.
 */
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
		struct page *page, pte_t *pte, bool write, bool anon)
{
	pte_t entry;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (anon) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, address);
	} else {
		inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
		page_add_file_rmap(page);
	}
	set_pte_at(vma->vm_mm, address, pte, entry);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache(vma, address, pte);
}

static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
			&fault_around_bytes_fops);
	if (!ret)
		pr_warn("Failed to create fault_around_bytes in debugfs\n");
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
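
/*
 * Illustrative userspace sketch (kept out of the build): the knob above is
 * a plain debugfs file, so it can be inspected and tuned at runtime. The
 * set handler rounds requests down to a power of two and rejects anything
 * larger than one page table's worth of pages. The path assumes debugfs is
 * mounted at the conventional /sys/kernel/debug.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/fault_around_bytes";
	unsigned long long val;
	FILE *f;

	f = fopen(path, "r");
	if (!f || fscanf(f, "%llu", &val) != 1)
		return 1;
	fclose(f);
	printf("fault_around_bytes = %llu\n", val);

	/* Request 16KB; fault_around_bytes_set() keeps it a power of two. */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "%u", 16384);
	fclose(f);
	return 0;
}
#endif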

/*
 * do_fault_around() tries to map a few pages around the fault address. The
 * hope is that those pages will be needed soon and this will lower the
 * number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips any page that is
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split
 * ptlock case the lock only protects the entries that belong to the page
 * table corresponding to the fault address.
 *
 * This function doesn't cross VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how much we'll try to map; do_fault_around()
 * expects it to be a power of two no larger than PTRS_PER_PTE pages.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes (and therefore to page order). This way it's easier to
 * guarantee that we don't cross page table boundaries.
 */
static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pgoff_t pgoff, unsigned int flags)
{
	unsigned long start_addr, nr_pages, mask;
	pgoff_t max_pgoff;
	struct vm_fault vmf;
	int off;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	start_addr = max(address & mask, vma->vm_start);
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pte -= off;
	pgoff -= off;

	/*
	 *  max_pgoff is either end of page table or end of vma
	 *  or nr_pages from pgoff, depending what is nearest.
	 */
	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
			pgoff + nr_pages - 1);

	/* Check if it makes any sense to call ->map_pages */
	while (!pte_none(*pte)) {
		if (++pgoff > max_pgoff)
			return;
		start_addr += PAGE_SIZE;
		if (start_addr >= vma->vm_end)
			return;
		pte++;
	}

	vmf.virtual_address = (void __user *) start_addr;
	vmf.pte = pte;
	vmf.pgoff = pgoff;
	vmf.max_pgoff = max_pgoff;
	vmf.flags = flags;
	vma->vm_ops->map_pages(vma, &vmf);
}
2916
2917static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2918 unsigned long address, pmd_t *pmd,
2919 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2920{
2921 struct page *fault_page;
2922 spinlock_t *ptl;
2923 pte_t *pte;
2924 int ret = 0;
2925
2926
2927
2928
2929
2930
2931 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
2932 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2933 do_fault_around(vma, address, pte, pgoff, flags);
2934 if (!pte_same(*pte, orig_pte))
2935 goto unlock_out;
2936 pte_unmap_unlock(pte, ptl);
2937 }
2938
2939 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
2940 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2941 return ret;
2942
2943 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2944 if (unlikely(!pte_same(*pte, orig_pte))) {
2945 pte_unmap_unlock(pte, ptl);
2946 unlock_page(fault_page);
2947 page_cache_release(fault_page);
2948 return ret;
2949 }
2950 do_set_pte(vma, address, fault_page, pte, false, false);
2951 unlock_page(fault_page);
2952unlock_out:
2953 pte_unmap_unlock(pte, ptl);
2954 return ret;
2955}

static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page, *new_page;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (!new_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
		page_cache_release(new_page);
		return VM_FAULT_OOM;
	}

	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;

	if (fault_page)
		copy_user_highpage(new_page, fault_page, address, vma);
	__SetPageUptodate(new_page);

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		if (fault_page) {
			unlock_page(fault_page);
			page_cache_release(fault_page);
		} else {
			/*
			 * The fault handler has no page to lock, so it holds
			 * i_mmap_lock for read to protect against truncate.
			 */
			i_mmap_unlock_read(vma->vm_file->f_mapping);
		}
		goto uncharge_out;
	}
	do_set_pte(vma, address, new_page, pte, true, true);
	mem_cgroup_commit_charge(new_page, memcg, false);
	lru_cache_add_active_or_unevictable(new_page, vma);
	pte_unmap_unlock(pte, ptl);
	if (fault_page) {
		unlock_page(fault_page);
		page_cache_release(fault_page);
	} else {
		/*
		 * The fault handler has no page to lock, so it holds
		 * i_mmap_lock for read to protect against truncate.
		 */
		i_mmap_unlock_read(vma->vm_file->f_mapping);
	}
	return ret;
uncharge_out:
	mem_cgroup_cancel_charge(new_page, memcg);
	page_cache_release(new_page);
	return ret;
}

static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	struct page *fault_page;
	struct address_space *mapping;
	spinlock_t *ptl;
	pte_t *pte;
	int dirtied = 0;
	int ret, tmp;

	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(fault_page);
		tmp = do_page_mkwrite(vma, fault_page, address);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			page_cache_release(fault_page);
			return tmp;
		}
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (unlikely(!pte_same(*pte, orig_pte))) {
		pte_unmap_unlock(pte, ptl);
		unlock_page(fault_page);
		page_cache_release(fault_page);
		return ret;
	}
	do_set_pte(vma, address, fault_page, pte, true, false);
	pte_unmap_unlock(pte, ptl);

	if (set_page_dirty(fault_page))
		dirtied = 1;
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page(). The address_space itself remains
	 * pinned by vma->vm_file's reference.
	 */
	mapping = fault_page->mapping;
	unlock_page(fault_page);
	if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
		/*
		 * Some device drivers do not set page.mapping but still
		 * dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	/* file_update_time outside page_lock */
	if (!vma->vm_ops->page_mkwrite)
		file_update_time(vma->vm_file);

	return ret;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = (((address & PAGE_MASK)
			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	pte_unmap(page_table);
	if (!(flags & FAULT_FLAG_WRITE))
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	if (!(vma->vm_flags & VM_SHARED))
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
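
/*
 * Illustrative userspace sketch (kept out of the build): one file page can
 * exercise all three handlers above, with the caveat that the dispatch only
 * happens on the *first* touch of a page - a later write to an already
 * mapped read-only page goes through do_wp_page() instead. A read fault
 * takes do_read_fault(); a write-first fault on a MAP_PRIVATE mapping takes
 * do_cow_fault() (the store never reaches the file); a write-first fault on
 * a MAP_SHARED mapping takes do_shared_fault() (the store lands in the page
 * cache). The temp file path is arbitrary.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/fault-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *priv, *shared, c, buf;

	if (fd < 0 || ftruncate(fd, 8192))
		return 1;

	priv = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	shared = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (priv == MAP_FAILED || shared == MAP_FAILED)
		return 1;

	c = shared[0];		/* first touch is a read  -> do_read_fault()   */
	priv[0] = 'p';		/* first touch is a write -> do_cow_fault()    */
	shared[4096] = 's';	/* first touch is a write -> do_shared_fault() */

	/* Only the shared store reaches the file; the COW copy is private. */
	if (pread(fd, &buf, 1, 4096) != 1)
		return 1;
	printf("file@4096 = '%c'; private write stayed private (read %d)\n",
	       buf, c);
	return 0;
}
#endif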

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
	struct page *page = NULL;
	spinlock_t *ptl;
	int page_nid = -1;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	bool was_writable = pte_write(pte);
	int flags = 0;

	/* A PROT_NONE fault should not end up here */
	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 *
	 * We can safely just do a "set_pte_at()", because the old
	 * page table entry is not accessible, so there would be no
	 * concurrent hardware modifications to the PTE.
	 */
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*ptep, pte))) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/* Make it present again */
	pte = pte_modify(pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	set_pte_at(mm, addr, ptep, pte);
	update_mmu_cache(vma, addr, ptep);

	page = vm_normal_page(vma, addr, pte);
	if (!page) {
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!(vma->vm_flags & VM_WRITE))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
	pte_unmap_unlock(ptep, ptl);
	if (target_nid == -1) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
		     struct vm_area_struct *vma, unsigned long address,
		     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	/*
	 * some architectures can have larger ptes than wordsize,
	 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
	 * The code below just needs a consistent view for the ifs and
	 * we later double check anyway with the ptl lock held. So here
	 * a barrier will do.
	 */
	entry = *pte;
	barrier();
	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma->vm_ops) {
				if (likely(vma->vm_ops->fault))
					return do_fault(mm, vma, address, pte,
							pmd, flags, entry);
			}
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, flags);
		}
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	if (pte_protnone(entry))
		return do_numa_page(mm, vma, address, entry, pte, pmd);

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		int ret = VM_FAULT_FALLBACK;
		if (!vma->vm_ops)
			ret = do_huge_pmd_anonymous_page(mm, vma, address,
					pmd, flags);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *pmd;
		int ret;

		barrier();
		if (pmd_trans_huge(orig_pmd)) {
			unsigned int dirty = flags & FAULT_FLAG_WRITE;

			/*
			 * If the pmd is splitting, return and retry the
			 * fault.  Alternative: wait until the split is
			 * done, and goto retry.
			 */
			if (pmd_trans_splitting(orig_pmd))
				return 0;

			if (pmd_protnone(orig_pmd))
				return do_huge_pmd_numa_page(mm, vma, address,
							     orig_pmd, pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
							  orig_pmd);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(mm, vma, address, pmd,
						      orig_pmd, dirty);
				return 0;
			}
		}
	}

	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if a huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(pmd_none(*pmd)) &&
	    unlikely(__pte_alloc(mm, vma, pmd, address)))
		return VM_FAULT_OOM;
	/* if a huge pmd materialized from under us just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	/*
	 * A regular pmd is established and it can't morph into a huge pmd
	 * from under us anymore at this point because we hold the mmap_sem
	 * read mode and khugepaged takes it in write mode. So now it's
	 * safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);

	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		    unsigned long address, unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	ret = __handle_mm_fault(mm, vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
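
/*
 * Illustrative sketch (not part of this file): the rough shape of an arch
 * page fault handler driving handle_mm_fault(). arch_do_page_fault() and
 * its arguments are hypothetical; real handlers (e.g. arch/x86/mm/fault.c)
 * do considerably more vetting before and after this point.
 */
#if 0
static void arch_do_page_fault(unsigned long address, bool user, bool write)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	int fault;

	if (user)
		flags |= FAULT_FLAG_USER;
	if (write)
		flags |= FAULT_FLAG_WRITE;
retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address)
		goto bad_area;	/* a real handler also tries stack expansion */

	fault = handle_mm_fault(mm, vma, address, flags);
	if (fault & VM_FAULT_RETRY) {
		/* mmap_sem was already released by the fault path */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	up_read(&mm->mmap_sem);
	if (fault & VM_FAULT_ERROR)
		; /* deliver SIGSEGV/SIGBUS or run the exception fixup path */
	return;

bad_area:
	up_read(&mm->mmap_sem);
	/* deliver SIGSEGV or run the kernel exception fixup path */
}
#endif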

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		pud_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

static int __follow_pte(struct mm_struct *mm, unsigned long address,
		pte_t **ptepp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
	if (pmd_huge(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!ptep)
		goto out;
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
out:
	return -EINVAL;
}

static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte(mm, address, ptepp, ptlp)));
	return res;
}

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);
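
/*
 * Illustrative kernel-side sketch (not part of this file): how a driver
 * might use follow_pfn() to learn which physical frame backs a user
 * address inside a VM_IO/VM_PFNMAP region it created. The helper name is
 * hypothetical; mmap_sem is held across the lookup so the vma stays stable.
 */
#if 0
static int demo_uaddr_to_pfn(unsigned long uaddr, unsigned long *pfn)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int ret = -EINVAL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, uaddr);
	/* follow_pfn() itself rejects VMAs that are not VM_IO/VM_PFNMAP */
	if (vma && vma->vm_start <= uaddr)
		ret = follow_pfn(vma, uaddr, pfn);
	up_read(&mm->mmap_sem);

	return ret;
}
#endif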

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages(tsk, mm, addr, 1,
				write, 1, &page, &vma);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			page_cache_release(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @write:	whether the access is a write
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, int write)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
	mmput(mm);

	return ret;
}
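
/*
 * Illustrative userspace sketch (kept out of the build): access_remote_vm()
 * is what backs reads of /proc/<pid>/mem, so peeking at another process's
 * memory from userspace ends up in the transfer loop above. Requires
 * ptrace-level privilege over the target; pid and address come from argv.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], buf[16];
	off_t addr;
	ssize_t n;
	int fd;

	if (argc != 3)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/mem", argv[1]);
	addr = (off_t)strtoull(argv[2], NULL, 0);

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	n = pread(fd, buf, sizeof(buf), addr);
	if (n > 0)
		printf("read %zd bytes from target\n", n);
	close(fd);
	return 0;
}
#endif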

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */
	if (preempt_count())
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf) {
			char *p;

			p = d_path(&f->f_path, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&mm->mmap_sem);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		return;

	/*
	 * it would be nicer only to annotate paths which are not under
	 * pagefault_disable, however that requires a larger audit and
	 * providing helpers like get_user_atomic.
	 */
	if (in_atomic())
		return;

	__might_sleep(__FILE__, __LINE__, 0);

	if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}
void clear_huge_page(struct page *page,
		     unsigned long addr, unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif