/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/export.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60#include <linux/migrate.h>
61#include <linux/string.h>
62#include <linux/dma-debug.h>
63#include <linux/debugfs.h>
64
65#include <asm/io.h>
66#include <asm/pgalloc.h>
67#include <asm/uaccess.h>
68#include <asm/tlb.h>
69#include <asm/tlbflush.h>
70#include <asm/pgtable.h>
71
72#include "internal.h"
73
74#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
75#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
76#endif
77
78#ifndef CONFIG_NEED_MULTIPLE_NODES
79
80unsigned long max_mapnr;
81struct page *mem_map;
82
83EXPORT_SYMBOL(max_mapnr);
84EXPORT_SYMBOL(mem_map);
85#endif
86
87
88
89
90
91
92
93
94void * high_memory;
95
96EXPORT_SYMBOL(high_memory);
97
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
104int randomize_va_space __read_mostly =
105#ifdef CONFIG_COMPAT_BRK
106 1;
107#else
108 2;
109#endif
110
111static int __init disable_randmaps(char *s)
112{
113 randomize_va_space = 0;
114 return 1;
115}
116__setup("norandmaps", disable_randmaps);
117
118unsigned long zero_pfn __read_mostly;
119unsigned long highest_memmap_pfn __read_mostly;
120
121EXPORT_SYMBOL(zero_pfn);
122
/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
126static int __init init_zero_pfn(void)
127{
128 zero_pfn = page_to_pfn(ZERO_PAGE(0));
129 return 0;
130}
131core_initcall(init_zero_pfn);
132
133
134#if defined(SPLIT_RSS_COUNTING)
135
136void sync_mm_rss(struct mm_struct *mm)
137{
138 int i;
139
140 for (i = 0; i < NR_MM_COUNTERS; i++) {
141 if (current->rss_stat.count[i]) {
142 add_mm_counter(mm, i, current->rss_stat.count[i]);
143 current->rss_stat.count[i] = 0;
144 }
145 }
146 current->rss_stat.events = 0;
147}
148
149static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
150{
151 struct task_struct *task = current;
152
153 if (likely(task->mm == mm))
154 task->rss_stat.count[member] += val;
155 else
156 add_mm_counter(mm, member, val);
157}
158#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
159#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
163static void check_sync_rss_stat(struct task_struct *task)
164{
165 if (unlikely(task != current))
166 return;
167 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
168 sync_mm_rss(task->mm);
169}
170#else
171
172#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
173#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
174
175static void check_sync_rss_stat(struct task_struct *task)
176{
177}
178
179#endif
180
181#ifdef HAVE_GENERIC_MMU_GATHER
182
183static int tlb_next_batch(struct mmu_gather *tlb)
184{
185 struct mmu_gather_batch *batch;
186
187 batch = tlb->active;
188 if (batch->next) {
189 tlb->active = batch->next;
190 return 1;
191 }
192
193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
194 return 0;
195
196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
197 if (!batch)
198 return 0;
199
200 tlb->batch_count++;
201 batch->next = NULL;
202 batch->nr = 0;
203 batch->max = MAX_GATHER_BATCH;
204
205 tlb->active->next = batch;
206 tlb->active = batch;
207
208 return 1;
209}
210
/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @start and @end are set to 0 and -1
 *	respectively when @mm is without users and we're going to destroy
 *	the full address space (exit/execve).
 */
216void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
217{
218 tlb->mm = mm;
219
220
221 tlb->fullmm = !(start | (end+1));
222 tlb->need_flush_all = 0;
223 tlb->local.next = NULL;
224 tlb->local.nr = 0;
225 tlb->local.max = ARRAY_SIZE(tlb->__pages);
226 tlb->active = &tlb->local;
227 tlb->batch_count = 0;
228
229#ifdef CONFIG_HAVE_RCU_TABLE_FREE
230 tlb->batch = NULL;
231#endif
232
233 __tlb_reset_range(tlb);
234}
235
236static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
237{
238 if (!tlb->end)
239 return;
240
241 tlb_flush(tlb);
242 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
243#ifdef CONFIG_HAVE_RCU_TABLE_FREE
244 tlb_table_flush(tlb);
245#endif
246 __tlb_reset_range(tlb);
247}
248
249static void tlb_flush_mmu_free(struct mmu_gather *tlb)
250{
251 struct mmu_gather_batch *batch;
252
253 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
254 free_pages_and_swap_cache(batch->pages, batch->nr);
255 batch->nr = 0;
256 }
257 tlb->active = &tlb->local;
258}
259
260void tlb_flush_mmu(struct mmu_gather *tlb)
261{
262 tlb_flush_mmu_tlbonly(tlb);
263 tlb_flush_mmu_free(tlb);
264}
265
/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
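
/*
 * Example (illustrative sketch): the usual calling pattern for the generic
 * mmu_gather API, mirroring what zap_page_range() later in this file does.
 * "mm", "vma", "start" and "end" are assumed to be set up by the caller.
 *
 *	struct mmu_gather tlb;
 *
 *	lru_add_drain();
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	update_hiwater_rss(mm);
 *	unmap_vmas(&tlb, vma, start, end);
 *	tlb_finish_mmu(&tlb, start, end);
 */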
285
/* __tlb_remove_page
 *	Queue a page for freeing once the TLB has been flushed, batching the
 *	work so that other CPUs caching valid mappings in their TLBs are
 *	handled safely. Returns the number of free slots left in the batch;
 *	zero means tlb_flush_mmu() is required before more pages can be
 *	queued.
 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 VM_BUG_ON(!tlb->end);
297
298 batch = tlb->active;
299 batch->pages[batch->nr++] = page;
300 if (batch->nr == batch->max) {
301 if (!tlb_next_batch(tlb))
302 return 0;
303 batch = tlb->active;
304 }
305 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
306
307 return batch->max - batch->nr;
308}
309
310#endif
311
312#ifdef CONFIG_HAVE_RCU_TABLE_FREE
313
314
315
316
317
318static void tlb_remove_table_smp_sync(void *arg)
319{
320
321}
322
323static void tlb_remove_table_one(void *table)
324{
325
326
327
328
329
330
331
332 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
333 __tlb_remove_table(table);
334}
335
336static void tlb_remove_table_rcu(struct rcu_head *head)
337{
338 struct mmu_table_batch *batch;
339 int i;
340
341 batch = container_of(head, struct mmu_table_batch, rcu);
342
343 for (i = 0; i < batch->nr; i++)
344 __tlb_remove_table(batch->tables[i]);
345
346 free_page((unsigned long)batch);
347}
348
349void tlb_table_flush(struct mmu_gather *tlb)
350{
351 struct mmu_table_batch **batch = &tlb->batch;
352
353 if (*batch) {
354 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
355 *batch = NULL;
356 }
357}
358
359void tlb_remove_table(struct mmu_gather *tlb, void *table)
360{
361 struct mmu_table_batch **batch = &tlb->batch;
362
	/*
	 * When there's less than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
367 if (atomic_read(&tlb->mm->mm_users) < 2) {
368 __tlb_remove_table(table);
369 return;
370 }
371
372 if (*batch == NULL) {
373 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
374 if (*batch == NULL) {
375 tlb_remove_table_one(table);
376 return;
377 }
378 (*batch)->nr = 0;
379 }
380 (*batch)->tables[(*batch)->nr++] = table;
381 if ((*batch)->nr == MAX_TABLE_BATCH)
382 tlb_table_flush(tlb);
383}
384
385#endif
386
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
391static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
392 unsigned long addr)
393{
394 pgtable_t token = pmd_pgtable(*pmd);
395 pmd_clear(pmd);
396 pte_free_tlb(tlb, token, addr);
397 atomic_long_dec(&tlb->mm->nr_ptes);
398}
399
400static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
401 unsigned long addr, unsigned long end,
402 unsigned long floor, unsigned long ceiling)
403{
404 pmd_t *pmd;
405 unsigned long next;
406 unsigned long start;
407
408 start = addr;
409 pmd = pmd_offset(pud, addr);
410 do {
411 next = pmd_addr_end(addr, end);
412 if (pmd_none_or_clear_bad(pmd))
413 continue;
414 free_pte_range(tlb, pmd, addr);
415 } while (pmd++, addr = next, addr != end);
416
417 start &= PUD_MASK;
418 if (start < floor)
419 return;
420 if (ceiling) {
421 ceiling &= PUD_MASK;
422 if (!ceiling)
423 return;
424 }
425 if (end - 1 > ceiling - 1)
426 return;
427
428 pmd = pmd_offset(pud, start);
429 pud_clear(pud);
430 pmd_free_tlb(tlb, pmd, start);
431 mm_dec_nr_pmds(tlb->mm);
432}
433
434static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
435 unsigned long addr, unsigned long end,
436 unsigned long floor, unsigned long ceiling)
437{
438 pud_t *pud;
439 unsigned long next;
440 unsigned long start;
441
442 start = addr;
443 pud = pud_offset(pgd, addr);
444 do {
445 next = pud_addr_end(addr, end);
446 if (pud_none_or_clear_bad(pud))
447 continue;
448 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
449 } while (pud++, addr = next, addr != end);
450
451 start &= PGDIR_MASK;
452 if (start < floor)
453 return;
454 if (ceiling) {
455 ceiling &= PGDIR_MASK;
456 if (!ceiling)
457 return;
458 }
459 if (end - 1 > ceiling - 1)
460 return;
461
462 pud = pud_offset(pgd, start);
463 pgd_clear(pgd);
464 pud_free_tlb(tlb, pud, start);
465}
466
/*
 * This function frees user-level page tables of a process.
 */
470void free_pgd_range(struct mmu_gather *tlb,
471 unsigned long addr, unsigned long end,
472 unsigned long floor, unsigned long ceiling)
473{
474 pgd_t *pgd;
475 unsigned long next;
476
	/*
	 * Boundary handling: only free a page-table page when the whole
	 * range it maps lies inside [floor, ceiling), otherwise a
	 * neighbouring vma may still need it.
	 *
	 * The "- 1" comparisons below exist because 0 stands for both the
	 * bottom of the address space (for addr/floor) and, by wrapping,
	 * for the top of it (for end/ceiling), so the upper bounds have to
	 * be compared as "end - 1" and "ceiling - 1".
	 */
503 addr &= PMD_MASK;
504 if (addr < floor) {
505 addr += PMD_SIZE;
506 if (!addr)
507 return;
508 }
509 if (ceiling) {
510 ceiling &= PMD_MASK;
511 if (!ceiling)
512 return;
513 }
514 if (end - 1 > ceiling - 1)
515 end -= PMD_SIZE;
516 if (addr > end - 1)
517 return;
518
519 pgd = pgd_offset(tlb->mm, addr);
520 do {
521 next = pgd_addr_end(addr, end);
522 if (pgd_none_or_clear_bad(pgd))
523 continue;
524 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
525 } while (pgd++, addr = next, addr != end);
526}
527
528void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
529 unsigned long floor, unsigned long ceiling)
530{
531 while (vma) {
532 struct vm_area_struct *next = vma->vm_next;
533 unsigned long addr = vma->vm_start;
534
		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
539 unlink_anon_vmas(vma);
540 unlink_file_vma(vma);
541
542 if (is_vm_hugetlb_page(vma)) {
543 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
544 floor, next? next->vm_start: ceiling);
545 } else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
549 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
550 && !is_vm_hugetlb_page(next)) {
551 vma = next;
552 next = vma->vm_next;
553 unlink_anon_vmas(vma);
554 unlink_file_vma(vma);
555 }
556 free_pgd_range(tlb, addr, vma->vm_end,
557 floor, next? next->vm_start: ceiling);
558 }
559 vma = next;
560 }
561}
562
563int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
564 pmd_t *pmd, unsigned long address)
565{
566 spinlock_t *ptl;
567 pgtable_t new = pte_alloc_one(mm, address);
568 int wait_split_huge_page;
569 if (!new)
570 return -ENOMEM;
571
	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
585 smp_wmb();
586
587 ptl = pmd_lock(mm, pmd);
588 wait_split_huge_page = 0;
589 if (likely(pmd_none(*pmd))) {
590 atomic_long_inc(&mm->nr_ptes);
591 pmd_populate(mm, pmd, new);
592 new = NULL;
593 } else if (unlikely(pmd_trans_splitting(*pmd)))
594 wait_split_huge_page = 1;
595 spin_unlock(ptl);
596 if (new)
597 pte_free(mm, new);
598 if (wait_split_huge_page)
599 wait_split_huge_page(vma->anon_vma, pmd);
600 return 0;
601}
602
603int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
604{
605 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
606 if (!new)
607 return -ENOMEM;
608
609 smp_wmb();
610
611 spin_lock(&init_mm.page_table_lock);
612 if (likely(pmd_none(*pmd))) {
613 pmd_populate_kernel(&init_mm, pmd, new);
614 new = NULL;
615 } else
616 VM_BUG_ON(pmd_trans_splitting(*pmd));
617 spin_unlock(&init_mm.page_table_lock);
618 if (new)
619 pte_free_kernel(&init_mm, new);
620 return 0;
621}
622
623static inline void init_rss_vec(int *rss)
624{
625 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
626}
627
628static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
629{
630 int i;
631
632 if (current->mm == mm)
633 sync_mm_rss(mm);
634 for (i = 0; i < NR_MM_COUNTERS; i++)
635 if (rss[i])
636 add_mm_counter(mm, i, rss[i]);
637}
638
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
646static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
647 pte_t pte, struct page *page)
648{
649 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
650 pud_t *pud = pud_offset(pgd, addr);
651 pmd_t *pmd = pmd_offset(pud, addr);
652 struct address_space *mapping;
653 pgoff_t index;
654 static unsigned long resume;
655 static unsigned long nr_shown;
656 static unsigned long nr_unshown;
657
658
659
660
661
662 if (nr_shown == 60) {
663 if (time_before(jiffies, resume)) {
664 nr_unshown++;
665 return;
666 }
667 if (nr_unshown) {
668 printk(KERN_ALERT
669 "BUG: Bad page map: %lu messages suppressed\n",
670 nr_unshown);
671 nr_unshown = 0;
672 }
673 nr_shown = 0;
674 }
675 if (nr_shown++ == 0)
676 resume = jiffies + 60 * HZ;
677
678 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
679 index = linear_page_index(vma, addr);
680
681 printk(KERN_ALERT
682 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
683 current->comm,
684 (long long)pte_val(pte), (long long)pmd_val(*pmd));
685 if (page)
686 dump_page(page, "bad pte");
687 printk(KERN_ALERT
688 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
689 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
690
691
692
693 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
694 vma->vm_file,
695 vma->vm_ops ? vma->vm_ops->fault : NULL,
696 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
697 mapping ? mapping->a_ops->readpage : NULL);
698 dump_stack();
699 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
700}
701
/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here.  "Normal" mappings do have a
 * struct page.
 *
 * There are 2 broad cases.  Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial.  Secondly,
 * an architecture may not have a spare pte bit, which requires a more
 * complicated scheme, described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by remap_pfn_range(): the vma will have the VM_PFNMAP bit set,
 * and vm_pgoff points to the first PFN mapped, so every special pte satisfies
 * pte_pfn(pte) == vm_pgoff + offset-within-vma; ptes that do not satisfy that
 * are the COWed pages and are treated as normal.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing: with pte_special(), pages without struct page are marked
 * special, and without it we rely on pfn_valid() to distinguish them.
 */
744#ifdef __HAVE_ARCH_PTE_SPECIAL
745# define HAVE_PTE_SPECIAL 1
746#else
747# define HAVE_PTE_SPECIAL 0
748#endif
749struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
750 pte_t pte)
751{
752 unsigned long pfn = pte_pfn(pte);
753
754 if (HAVE_PTE_SPECIAL) {
755 if (likely(!pte_special(pte)))
756 goto check_pfn;
757 if (vma->vm_ops && vma->vm_ops->find_special_page)
758 return vma->vm_ops->find_special_page(vma, addr);
759 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
760 return NULL;
761 if (!is_zero_pfn(pfn))
762 print_bad_pte(vma, addr, pte, NULL);
763 return NULL;
764 }
765
766
767
768 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
769 if (vma->vm_flags & VM_MIXEDMAP) {
770 if (!pfn_valid(pfn))
771 return NULL;
772 goto out;
773 } else {
774 unsigned long off;
775 off = (addr - vma->vm_start) >> PAGE_SHIFT;
776 if (pfn == vma->vm_pgoff + off)
777 return NULL;
778 if (!is_cow_mapping(vma->vm_flags))
779 return NULL;
780 }
781 }
782
783 if (is_zero_pfn(pfn))
784 return NULL;
785check_pfn:
786 if (unlikely(pfn > highest_memmap_pfn)) {
787 print_bad_pte(vma, addr, pte, NULL);
788 return NULL;
789 }
790
791
792
793
794
795out:
796 return pfn_to_page(pfn);
797}
798
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
805static inline unsigned long
806copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
807 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
808 unsigned long addr, int *rss)
809{
810 unsigned long vm_flags = vma->vm_flags;
811 pte_t pte = *src_pte;
812 struct page *page;
813
814
815 if (unlikely(!pte_present(pte))) {
816 swp_entry_t entry = pte_to_swp_entry(pte);
817
818 if (likely(!non_swap_entry(entry))) {
819 if (swap_duplicate(entry) < 0)
820 return entry.val;
821
822
823 if (unlikely(list_empty(&dst_mm->mmlist))) {
824 spin_lock(&mmlist_lock);
825 if (list_empty(&dst_mm->mmlist))
826 list_add(&dst_mm->mmlist,
827 &src_mm->mmlist);
828 spin_unlock(&mmlist_lock);
829 }
830 rss[MM_SWAPENTS]++;
831 } else if (is_migration_entry(entry)) {
832 page = migration_entry_to_page(entry);
833
834 if (PageAnon(page))
835 rss[MM_ANONPAGES]++;
836 else
837 rss[MM_FILEPAGES]++;
838
839 if (is_write_migration_entry(entry) &&
840 is_cow_mapping(vm_flags)) {
841
842
843
844
845 make_migration_entry_read(&entry);
846 pte = swp_entry_to_pte(entry);
847 if (pte_swp_soft_dirty(*src_pte))
848 pte = pte_swp_mksoft_dirty(pte);
849 set_pte_at(src_mm, addr, src_pte, pte);
850 }
851 }
852 goto out_set_pte;
853 }
854
855
856
857
858
859 if (is_cow_mapping(vm_flags)) {
860 ptep_set_wrprotect(src_mm, addr, src_pte);
861 pte = pte_wrprotect(pte);
862 }
863
864
865
866
867
868 if (vm_flags & VM_SHARED)
869 pte = pte_mkclean(pte);
870 pte = pte_mkold(pte);
871
872 page = vm_normal_page(vma, addr, pte);
873 if (page) {
874 get_page(page);
875 page_dup_rmap(page);
876 if (PageAnon(page))
877 rss[MM_ANONPAGES]++;
878 else
879 rss[MM_FILEPAGES]++;
880 }
881
882out_set_pte:
883 set_pte_at(dst_mm, addr, dst_pte, pte);
884 return 0;
885}
886
887static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
888 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
889 unsigned long addr, unsigned long end)
890{
891 pte_t *orig_src_pte, *orig_dst_pte;
892 pte_t *src_pte, *dst_pte;
893 spinlock_t *src_ptl, *dst_ptl;
894 int progress = 0;
895 int rss[NR_MM_COUNTERS];
896 swp_entry_t entry = (swp_entry_t){0};
897
898again:
899 init_rss_vec(rss);
900
901 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
902 if (!dst_pte)
903 return -ENOMEM;
904 src_pte = pte_offset_map(src_pmd, addr);
905 src_ptl = pte_lockptr(src_mm, src_pmd);
906 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
907 orig_src_pte = src_pte;
908 orig_dst_pte = dst_pte;
909 arch_enter_lazy_mmu_mode();
910
911 do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
916 if (progress >= 32) {
917 progress = 0;
918 if (need_resched() ||
919 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
920 break;
921 }
922 if (pte_none(*src_pte)) {
923 progress++;
924 continue;
925 }
926 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
927 vma, addr, rss);
928 if (entry.val)
929 break;
930 progress += 8;
931 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
932
933 arch_leave_lazy_mmu_mode();
934 spin_unlock(src_ptl);
935 pte_unmap(orig_src_pte);
936 add_mm_rss_vec(dst_mm, rss);
937 pte_unmap_unlock(orig_dst_pte, dst_ptl);
938 cond_resched();
939
940 if (entry.val) {
941 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
942 return -ENOMEM;
943 progress = 0;
944 }
945 if (addr != end)
946 goto again;
947 return 0;
948}
949
950static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
951 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
952 unsigned long addr, unsigned long end)
953{
954 pmd_t *src_pmd, *dst_pmd;
955 unsigned long next;
956
957 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
958 if (!dst_pmd)
959 return -ENOMEM;
960 src_pmd = pmd_offset(src_pud, addr);
961 do {
962 next = pmd_addr_end(addr, end);
963 if (pmd_trans_huge(*src_pmd)) {
964 int err;
965 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
966 err = copy_huge_pmd(dst_mm, src_mm,
967 dst_pmd, src_pmd, addr, vma);
968 if (err == -ENOMEM)
969 return -ENOMEM;
970 if (!err)
971 continue;
972
973 }
974 if (pmd_none_or_clear_bad(src_pmd))
975 continue;
976 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
977 vma, addr, next))
978 return -ENOMEM;
979 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
980 return 0;
981}
982
983static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
984 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
985 unsigned long addr, unsigned long end)
986{
987 pud_t *src_pud, *dst_pud;
988 unsigned long next;
989
990 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
991 if (!dst_pud)
992 return -ENOMEM;
993 src_pud = pud_offset(src_pgd, addr);
994 do {
995 next = pud_addr_end(addr, end);
996 if (pud_none_or_clear_bad(src_pud))
997 continue;
998 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
999 vma, addr, next))
1000 return -ENOMEM;
1001 } while (dst_pud++, src_pud++, addr = next, addr != end);
1002 return 0;
1003}
1004
1005int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1006 struct vm_area_struct *vma)
1007{
1008 pgd_t *src_pgd, *dst_pgd;
1009 unsigned long next;
1010 unsigned long addr = vma->vm_start;
1011 unsigned long end = vma->vm_end;
1012 unsigned long mmun_start;
1013 unsigned long mmun_end;
1014 bool is_cow;
1015 int ret;
1016
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1023 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1024 !vma->anon_vma)
1025 return 0;
1026
1027 if (is_vm_hugetlb_page(vma))
1028 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1029
1030 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1031
1032
1033
1034
1035 ret = track_pfn_copy(vma);
1036 if (ret)
1037 return ret;
1038 }
1039
1040
1041
1042
1043
1044
1045
1046 is_cow = is_cow_mapping(vma->vm_flags);
1047 mmun_start = addr;
1048 mmun_end = end;
1049 if (is_cow)
1050 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1051 mmun_end);
1052
1053 ret = 0;
1054 dst_pgd = pgd_offset(dst_mm, addr);
1055 src_pgd = pgd_offset(src_mm, addr);
1056 do {
1057 next = pgd_addr_end(addr, end);
1058 if (pgd_none_or_clear_bad(src_pgd))
1059 continue;
1060 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1061 vma, addr, next))) {
1062 ret = -ENOMEM;
1063 break;
1064 }
1065 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1066
1067 if (is_cow)
1068 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1069 return ret;
1070}
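
/*
 * Example (illustrative sketch): copy_page_range() is driven from dup_mmap()
 * at fork() time, roughly one call per copied vma; "mm" is the child's mm,
 * "oldmm" the parent's and "mpnt" the parent vma being duplicated.
 *
 *	retval = copy_page_range(mm, oldmm, mpnt);
 *	if (retval)
 *		goto fail;	(hypothetical error label in the caller)
 */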
1071
1072static unsigned long zap_pte_range(struct mmu_gather *tlb,
1073 struct vm_area_struct *vma, pmd_t *pmd,
1074 unsigned long addr, unsigned long end,
1075 struct zap_details *details)
1076{
1077 struct mm_struct *mm = tlb->mm;
1078 int force_flush = 0;
1079 int rss[NR_MM_COUNTERS];
1080 spinlock_t *ptl;
1081 pte_t *start_pte;
1082 pte_t *pte;
1083 swp_entry_t entry;
1084
1085again:
1086 init_rss_vec(rss);
1087 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1088 pte = start_pte;
1089 arch_enter_lazy_mmu_mode();
1090 do {
1091 pte_t ptent = *pte;
1092 if (pte_none(ptent)) {
1093 continue;
1094 }
1095
1096 if (pte_present(ptent)) {
1097 struct page *page;
1098
1099 page = vm_normal_page(vma, addr, ptent);
1100 if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
1106 if (details->check_mapping &&
1107 details->check_mapping != page->mapping)
1108 continue;
1109 }
1110 ptent = ptep_get_and_clear_full(mm, addr, pte,
1111 tlb->fullmm);
1112 tlb_remove_tlb_entry(tlb, pte, addr);
1113 if (unlikely(!page))
1114 continue;
1115 if (PageAnon(page))
1116 rss[MM_ANONPAGES]--;
1117 else {
1118 if (pte_dirty(ptent)) {
1119 force_flush = 1;
1120 set_page_dirty(page);
1121 }
1122 if (pte_young(ptent) &&
1123 likely(!(vma->vm_flags & VM_SEQ_READ)))
1124 mark_page_accessed(page);
1125 rss[MM_FILEPAGES]--;
1126 }
1127 page_remove_rmap(page);
1128 if (unlikely(page_mapcount(page) < 0))
1129 print_bad_pte(vma, addr, ptent, page);
1130 if (unlikely(!__tlb_remove_page(tlb, page))) {
1131 force_flush = 1;
1132 addr += PAGE_SIZE;
1133 break;
1134 }
1135 continue;
1136 }
1137
1138 if (unlikely(details))
1139 continue;
1140
1141 entry = pte_to_swp_entry(ptent);
1142 if (!non_swap_entry(entry))
1143 rss[MM_SWAPENTS]--;
1144 else if (is_migration_entry(entry)) {
1145 struct page *page;
1146
1147 page = migration_entry_to_page(entry);
1148
1149 if (PageAnon(page))
1150 rss[MM_ANONPAGES]--;
1151 else
1152 rss[MM_FILEPAGES]--;
1153 }
1154 if (unlikely(!free_swap_and_cache(entry)))
1155 print_bad_pte(vma, addr, ptent, NULL);
1156 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1157 } while (pte++, addr += PAGE_SIZE, addr != end);
1158
1159 add_mm_rss_vec(mm, rss);
1160 arch_leave_lazy_mmu_mode();
1161
	/* Do the actual TLB flush before dropping ptl */
1163 if (force_flush)
1164 tlb_flush_mmu_tlbonly(tlb);
1165 pte_unmap_unlock(start_pte, ptl);
1166
	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
1173 if (force_flush) {
1174 force_flush = 0;
1175 tlb_flush_mmu_free(tlb);
1176
1177 if (addr != end)
1178 goto again;
1179 }
1180
1181 return addr;
1182}
1183
1184static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1185 struct vm_area_struct *vma, pud_t *pud,
1186 unsigned long addr, unsigned long end,
1187 struct zap_details *details)
1188{
1189 pmd_t *pmd;
1190 unsigned long next;
1191
1192 pmd = pmd_offset(pud, addr);
1193 do {
1194 next = pmd_addr_end(addr, end);
1195 if (pmd_trans_huge(*pmd)) {
1196 if (next - addr != HPAGE_PMD_SIZE) {
1197#ifdef CONFIG_DEBUG_VM
1198 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1199 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1200 __func__, addr, end,
1201 vma->vm_start,
1202 vma->vm_end);
1203 BUG();
1204 }
1205#endif
1206 split_huge_page_pmd(vma, addr, pmd);
1207 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1208 goto next;
1209
1210 }
1211
1212
1213
1214
1215
1216
1217
1218 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1219 goto next;
1220 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1221next:
1222 cond_resched();
1223 } while (pmd++, addr = next, addr != end);
1224
1225 return addr;
1226}
1227
1228static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1229 struct vm_area_struct *vma, pgd_t *pgd,
1230 unsigned long addr, unsigned long end,
1231 struct zap_details *details)
1232{
1233 pud_t *pud;
1234 unsigned long next;
1235
1236 pud = pud_offset(pgd, addr);
1237 do {
1238 next = pud_addr_end(addr, end);
1239 if (pud_none_or_clear_bad(pud))
1240 continue;
1241 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1242 } while (pud++, addr = next, addr != end);
1243
1244 return addr;
1245}
1246
1247static void unmap_page_range(struct mmu_gather *tlb,
1248 struct vm_area_struct *vma,
1249 unsigned long addr, unsigned long end,
1250 struct zap_details *details)
1251{
1252 pgd_t *pgd;
1253 unsigned long next;
1254
1255 if (details && !details->check_mapping)
1256 details = NULL;
1257
1258 BUG_ON(addr >= end);
1259 tlb_start_vma(tlb, vma);
1260 pgd = pgd_offset(vma->vm_mm, addr);
1261 do {
1262 next = pgd_addr_end(addr, end);
1263 if (pgd_none_or_clear_bad(pgd))
1264 continue;
1265 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1266 } while (pgd++, addr = next, addr != end);
1267 tlb_end_vma(tlb, vma);
1268}
1269
1270
1271static void unmap_single_vma(struct mmu_gather *tlb,
1272 struct vm_area_struct *vma, unsigned long start_addr,
1273 unsigned long end_addr,
1274 struct zap_details *details)
1275{
1276 unsigned long start = max(vma->vm_start, start_addr);
1277 unsigned long end;
1278
1279 if (start >= vma->vm_end)
1280 return;
1281 end = min(vma->vm_end, end_addr);
1282 if (end <= vma->vm_start)
1283 return;
1284
1285 if (vma->vm_file)
1286 uprobe_munmap(vma, start, end);
1287
1288 if (unlikely(vma->vm_flags & VM_PFNMAP))
1289 untrack_pfn(vma, 0, 0);
1290
1291 if (start != end) {
1292 if (unlikely(is_vm_hugetlb_page(vma))) {
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304 if (vma->vm_file) {
1305 i_mmap_lock_write(vma->vm_file->f_mapping);
1306 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1307 i_mmap_unlock_write(vma->vm_file->f_mapping);
1308 }
1309 } else
1310 unmap_page_range(tlb, vma, start, end, details);
1311 }
1312}
1313
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
1332void unmap_vmas(struct mmu_gather *tlb,
1333 struct vm_area_struct *vma, unsigned long start_addr,
1334 unsigned long end_addr)
1335{
1336 struct mm_struct *mm = vma->vm_mm;
1337
1338 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1339 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1340 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1341 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1342}
1343
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Caller must protect the VMA list
 */
1353void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1354 unsigned long size, struct zap_details *details)
1355{
1356 struct mm_struct *mm = vma->vm_mm;
1357 struct mmu_gather tlb;
1358 unsigned long end = start + size;
1359
1360 lru_add_drain();
1361 tlb_gather_mmu(&tlb, mm, start, end);
1362 update_hiwater_rss(mm);
1363 mmu_notifier_invalidate_range_start(mm, start, end);
1364 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1365 unmap_single_vma(&tlb, vma, start, end, details);
1366 mmu_notifier_invalidate_range_end(mm, start, end);
1367 tlb_finish_mmu(&tlb, start, end);
1368}
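
/*
 * Example (illustrative sketch): a typical caller such as the MADV_DONTNEED
 * path simply drops every pte in [start, end) of a single range:
 *
 *	zap_page_range(vma, start, end - start, NULL);
 */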
1369
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1379static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1380 unsigned long size, struct zap_details *details)
1381{
1382 struct mm_struct *mm = vma->vm_mm;
1383 struct mmu_gather tlb;
1384 unsigned long end = address + size;
1385
1386 lru_add_drain();
1387 tlb_gather_mmu(&tlb, mm, address, end);
1388 update_hiwater_rss(mm);
1389 mmu_notifier_invalidate_range_start(mm, address, end);
1390 unmap_single_vma(&tlb, vma, address, end, details);
1391 mmu_notifier_invalidate_range_end(mm, address, end);
1392 tlb_finish_mmu(&tlb, address, end);
1393}
1394
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
1407int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1408 unsigned long size)
1409{
1410 if (address < vma->vm_start || address + size > vma->vm_end ||
1411 !(vma->vm_flags & VM_PFNMAP))
1412 return -1;
1413 zap_page_range_single(vma, address, size, NULL);
1414 return 0;
1415}
1416EXPORT_SYMBOL_GPL(zap_vma_ptes);
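
/*
 * Example (illustrative sketch): a driver that set up a VM_PFNMAP mapping can
 * revoke all of it before tearing the hardware mapping down; "vma" here is
 * the vma the driver previously remapped.
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 */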
1417
1418pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1419 spinlock_t **ptl)
1420{
1421 pgd_t * pgd = pgd_offset(mm, addr);
1422 pud_t * pud = pud_alloc(mm, pgd, addr);
1423 if (pud) {
1424 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1425 if (pmd) {
1426 VM_BUG_ON(pmd_trans_huge(*pmd));
1427 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1428 }
1429 }
1430 return NULL;
1431}
1432
1433
1434
1435
1436
1437
1438
1439
1440static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1441 struct page *page, pgprot_t prot)
1442{
1443 struct mm_struct *mm = vma->vm_mm;
1444 int retval;
1445 pte_t *pte;
1446 spinlock_t *ptl;
1447
1448 retval = -EINVAL;
1449 if (PageAnon(page))
1450 goto out;
1451 retval = -ENOMEM;
1452 flush_dcache_page(page);
1453 pte = get_locked_pte(mm, addr, &ptl);
1454 if (!pte)
1455 goto out;
1456 retval = -EBUSY;
1457 if (!pte_none(*pte))
1458 goto out_unlock;
1459
1460
1461 get_page(page);
1462 inc_mm_counter_fast(mm, MM_FILEPAGES);
1463 page_add_file_rmap(page);
1464 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1465
1466 retval = 0;
1467 pte_unmap_unlock(pte, ptl);
1468 return retval;
1469out_unlock:
1470 pte_unmap_unlock(pte, ptl);
1471out:
1472 return retval;
1473}
1474
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 */
1502int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1503 struct page *page)
1504{
1505 if (addr < vma->vm_start || addr >= vma->vm_end)
1506 return -EFAULT;
1507 if (!page_count(page))
1508 return -EINVAL;
1509 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1510 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1511 BUG_ON(vma->vm_flags & VM_PFNMAP);
1512 vma->vm_flags |= VM_MIXEDMAP;
1513 }
1514 return insert_page(vma, addr, page, vma->vm_page_prot);
1515}
1516EXPORT_SYMBOL(vm_insert_page);
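
/*
 * Example (illustrative sketch): a driver ->mmap() handler inserting an array
 * of order-0 pages it allocated itself.  "pages" and "npages" are
 * hypothetical driver state.
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long addr = vma->vm_start;
 *		int i, err;
 *
 *		for (i = 0; i < npages && addr < vma->vm_end;
 *		     i++, addr += PAGE_SIZE) {
 *			err = vm_insert_page(vma, addr, pages[i]);
 *			if (err)
 *				return err;
 *		}
 *		return 0;
 *	}
 */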
1517
1518static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1519 unsigned long pfn, pgprot_t prot)
1520{
1521 struct mm_struct *mm = vma->vm_mm;
1522 int retval;
1523 pte_t *pte, entry;
1524 spinlock_t *ptl;
1525
1526 retval = -ENOMEM;
1527 pte = get_locked_pte(mm, addr, &ptl);
1528 if (!pte)
1529 goto out;
1530 retval = -EBUSY;
1531 if (!pte_none(*pte))
1532 goto out_unlock;
1533
1534
1535 entry = pte_mkspecial(pfn_pte(pfn, prot));
1536 set_pte_at(mm, addr, pte, entry);
1537 update_mmu_cache(vma, addr, pte);
1538
1539 retval = 0;
1540out_unlock:
1541 pte_unmap_unlock(pte, ptl);
1542out:
1543 return retval;
1544}
1545
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * It is typically called from a vm_ops->fault handler for a VM_PFNMAP
 * mapping; the vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
1563int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1564 unsigned long pfn)
1565{
1566 int ret;
1567 pgprot_t pgprot = vma->vm_page_prot;
1568
1569
1570
1571
1572
1573
1574 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1575 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1576 (VM_PFNMAP|VM_MIXEDMAP));
1577 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1578 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1579
1580 if (addr < vma->vm_start || addr >= vma->vm_end)
1581 return -EFAULT;
1582 if (track_pfn_insert(vma, &pgprot, pfn))
1583 return -EINVAL;
1584
1585 ret = insert_pfn(vma, addr, pfn, pgprot);
1586
1587 return ret;
1588}
1589EXPORT_SYMBOL(vm_insert_pfn);
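
/*
 * Example (illustrative sketch): a ->fault() handler for a VM_PFNMAP vma
 * inserting the pte itself and telling the core fault path not to.
 * "foo_pfn_for()" is a hypothetical helper translating the page offset into
 * a device pfn.
 *
 *	static int foo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		unsigned long pfn = foo_pfn_for(vmf->pgoff);
 *		int err;
 *
 *		err = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
 *				    pfn);
 *		if (err == -ENOMEM)
 *			return VM_FAULT_OOM;
 *		if (err < 0 && err != -EBUSY)
 *			return VM_FAULT_SIGBUS;
 *		return VM_FAULT_NOPAGE;
 *	}
 */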
1590
1591int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1592 unsigned long pfn)
1593{
1594 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1595
1596 if (addr < vma->vm_start || addr >= vma->vm_end)
1597 return -EFAULT;
1598
1599
1600
1601
1602
1603
1604
1605
1606 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1607 struct page *page;
1608
1609 page = pfn_to_page(pfn);
1610 return insert_page(vma, addr, page, vma->vm_page_prot);
1611 }
1612 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1613}
1614EXPORT_SYMBOL(vm_insert_mixed);
1615
1616
1617
1618
1619
1620
1621static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1622 unsigned long addr, unsigned long end,
1623 unsigned long pfn, pgprot_t prot)
1624{
1625 pte_t *pte;
1626 spinlock_t *ptl;
1627
1628 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1629 if (!pte)
1630 return -ENOMEM;
1631 arch_enter_lazy_mmu_mode();
1632 do {
1633 BUG_ON(!pte_none(*pte));
1634 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1635 pfn++;
1636 } while (pte++, addr += PAGE_SIZE, addr != end);
1637 arch_leave_lazy_mmu_mode();
1638 pte_unmap_unlock(pte - 1, ptl);
1639 return 0;
1640}
1641
1642static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1643 unsigned long addr, unsigned long end,
1644 unsigned long pfn, pgprot_t prot)
1645{
1646 pmd_t *pmd;
1647 unsigned long next;
1648
1649 pfn -= addr >> PAGE_SHIFT;
1650 pmd = pmd_alloc(mm, pud, addr);
1651 if (!pmd)
1652 return -ENOMEM;
1653 VM_BUG_ON(pmd_trans_huge(*pmd));
1654 do {
1655 next = pmd_addr_end(addr, end);
1656 if (remap_pte_range(mm, pmd, addr, next,
1657 pfn + (addr >> PAGE_SHIFT), prot))
1658 return -ENOMEM;
1659 } while (pmd++, addr = next, addr != end);
1660 return 0;
1661}
1662
1663static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1664 unsigned long addr, unsigned long end,
1665 unsigned long pfn, pgprot_t prot)
1666{
1667 pud_t *pud;
1668 unsigned long next;
1669
1670 pfn -= addr >> PAGE_SHIFT;
1671 pud = pud_alloc(mm, pgd, addr);
1672 if (!pud)
1673 return -ENOMEM;
1674 do {
1675 next = pud_addr_end(addr, end);
1676 if (remap_pmd_range(mm, pud, addr, next,
1677 pfn + (addr >> PAGE_SHIFT), prot))
1678 return -ENOMEM;
1679 } while (pud++, addr = next, addr != end);
1680 return 0;
1681}
1682
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */
1693int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1694 unsigned long pfn, unsigned long size, pgprot_t prot)
1695{
1696 pgd_t *pgd;
1697 unsigned long next;
1698 unsigned long end = addr + PAGE_ALIGN(size);
1699 struct mm_struct *mm = vma->vm_mm;
1700 int err;
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720 if (is_cow_mapping(vma->vm_flags)) {
1721 if (addr != vma->vm_start || end != vma->vm_end)
1722 return -EINVAL;
1723 vma->vm_pgoff = pfn;
1724 }
1725
1726 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
1727 if (err)
1728 return -EINVAL;
1729
1730 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1731
1732 BUG_ON(addr >= end);
1733 pfn -= addr >> PAGE_SHIFT;
1734 pgd = pgd_offset(mm, addr);
1735 flush_cache_range(vma, addr, end);
1736 do {
1737 next = pgd_addr_end(addr, end);
1738 err = remap_pud_range(mm, pgd, addr, next,
1739 pfn + (addr >> PAGE_SHIFT), prot);
1740 if (err)
1741 break;
1742 } while (pgd++, addr = next, addr != end);
1743
1744 if (err)
1745 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
1746
1747 return err;
1748}
1749EXPORT_SYMBOL(remap_pfn_range);
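
/*
 * Example (illustrative sketch): the classic use of remap_pfn_range() from a
 * character driver's ->mmap(); "foo_phys" is a hypothetical physical base
 * address owned by the driver.
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       foo_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */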
1750
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */
1764int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1765{
1766 unsigned long vm_len, pfn, pages;
1767
1768
1769 if (start + len < start)
1770 return -EINVAL;
1771
1772
1773
1774
1775
1776 len += start & ~PAGE_MASK;
1777 pfn = start >> PAGE_SHIFT;
1778 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1779 if (pfn + pages < pfn)
1780 return -EINVAL;
1781
1782
1783 if (vma->vm_pgoff > pages)
1784 return -EINVAL;
1785 pfn += vma->vm_pgoff;
1786 pages -= vma->vm_pgoff;
1787
1788
1789 vm_len = vma->vm_end - vma->vm_start;
1790 if (vm_len >> PAGE_SHIFT > pages)
1791 return -EINVAL;
1792
1793
1794 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1795}
1796EXPORT_SYMBOL(vm_iomap_memory);
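
/*
 * Example (illustrative sketch): vm_iomap_memory() lets a driver map a whole
 * MMIO region without doing the pfn/offset arithmetic itself;
 * "foo->mmio_start" and "foo->mmio_len" are hypothetical device fields.
 *
 *	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct foo_dev *foo = file->private_data;
 *
 *		return vm_iomap_memory(vma, foo->mmio_start, foo->mmio_len);
 *	}
 */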
1797
1798static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1799 unsigned long addr, unsigned long end,
1800 pte_fn_t fn, void *data)
1801{
1802 pte_t *pte;
1803 int err;
1804 pgtable_t token;
1805 spinlock_t *uninitialized_var(ptl);
1806
1807 pte = (mm == &init_mm) ?
1808 pte_alloc_kernel(pmd, addr) :
1809 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1810 if (!pte)
1811 return -ENOMEM;
1812
1813 BUG_ON(pmd_huge(*pmd));
1814
1815 arch_enter_lazy_mmu_mode();
1816
1817 token = pmd_pgtable(*pmd);
1818
1819 do {
1820 err = fn(pte++, token, addr, data);
1821 if (err)
1822 break;
1823 } while (addr += PAGE_SIZE, addr != end);
1824
1825 arch_leave_lazy_mmu_mode();
1826
1827 if (mm != &init_mm)
1828 pte_unmap_unlock(pte-1, ptl);
1829 return err;
1830}
1831
1832static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1833 unsigned long addr, unsigned long end,
1834 pte_fn_t fn, void *data)
1835{
1836 pmd_t *pmd;
1837 unsigned long next;
1838 int err;
1839
1840 BUG_ON(pud_huge(*pud));
1841
1842 pmd = pmd_alloc(mm, pud, addr);
1843 if (!pmd)
1844 return -ENOMEM;
1845 do {
1846 next = pmd_addr_end(addr, end);
1847 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1848 if (err)
1849 break;
1850 } while (pmd++, addr = next, addr != end);
1851 return err;
1852}
1853
1854static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1855 unsigned long addr, unsigned long end,
1856 pte_fn_t fn, void *data)
1857{
1858 pud_t *pud;
1859 unsigned long next;
1860 int err;
1861
1862 pud = pud_alloc(mm, pgd, addr);
1863 if (!pud)
1864 return -ENOMEM;
1865 do {
1866 next = pud_addr_end(addr, end);
1867 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1868 if (err)
1869 break;
1870 } while (pud++, addr = next, addr != end);
1871 return err;
1872}
1873
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
1878int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1879 unsigned long size, pte_fn_t fn, void *data)
1880{
1881 pgd_t *pgd;
1882 unsigned long next;
1883 unsigned long end = addr + size;
1884 int err;
1885
1886 BUG_ON(addr >= end);
1887 pgd = pgd_offset(mm, addr);
1888 do {
1889 next = pgd_addr_end(addr, end);
1890 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1891 if (err)
1892 break;
1893 } while (pgd++, addr = next, addr != end);
1894
1895 return err;
1896}
1897EXPORT_SYMBOL_GPL(apply_to_page_range);
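
/*
 * Example (illustrative sketch): a pte_fn_t callback is invoked once per pte
 * slot with the pte pointer, the page-table page ("token"), the virtual
 * address and the opaque data pointer; returning non-zero stops the walk.
 * Here a hypothetical helper counts populated entries in a kernel-space
 * range; "start" and "size" are assumed to be supplied by the caller.
 *
 *	static int foo_count_pte(pte_t *pte, pgtable_t token,
 *				 unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (!pte_none(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long mapped = 0;
 *	apply_to_page_range(&init_mm, start, size, foo_count_pte, &mapped);
 */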
1898
/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, we must recheck the pte under its lock and bail out if it changed,
 * instead of committing and checking afterwards.
 */
1907static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1908 pte_t *page_table, pte_t orig_pte)
1909{
1910 int same = 1;
1911#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1912 if (sizeof(pte_t) > sizeof(unsigned long)) {
1913 spinlock_t *ptl = pte_lockptr(mm, pmd);
1914 spin_lock(ptl);
1915 same = pte_same(*page_table, orig_pte);
1916 spin_unlock(ptl);
1917 }
1918#endif
1919 pte_unmap(page_table);
1920 return same;
1921}
1922
1923static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1924{
1925 debug_dma_assert_idle(src);
1926
1927
1928
1929
1930
1931
1932
1933 if (unlikely(!src)) {
1934 void *kaddr = kmap_atomic(dst);
1935 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1936
1937
1938
1939
1940
1941
1942
1943 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1944 clear_page(kaddr);
1945 kunmap_atomic(kaddr);
1946 flush_dcache_page(dst);
1947 } else
1948 copy_user_highpage(dst, src, va, vma);
1949}
1950
1951
1952
1953
1954
1955
1956
1957static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1958 unsigned long address)
1959{
1960 struct vm_fault vmf;
1961 int ret;
1962
1963 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
1964 vmf.pgoff = page->index;
1965 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1966 vmf.page = page;
1967 vmf.cow_page = NULL;
1968
1969 ret = vma->vm_ops->page_mkwrite(vma, &vmf);
1970 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
1971 return ret;
1972 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
1973 lock_page(page);
1974 if (!page->mapping) {
1975 unlock_page(page);
1976 return 0;
1977 }
1978 ret |= VM_FAULT_LOCKED;
1979 } else
1980 VM_BUG_ON_PAGE(!PageLocked(page), page);
1981 return ret;
1982}
1983
/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
1992static inline int wp_page_reuse(struct mm_struct *mm,
1993 struct vm_area_struct *vma, unsigned long address,
1994 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
1995 struct page *page, int page_mkwrite,
1996 int dirty_shared)
1997 __releases(ptl)
1998{
1999 pte_t entry;
2000
2001
2002
2003
2004
2005 if (page)
2006 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2007
2008 flush_cache_page(vma, address, pte_pfn(orig_pte));
2009 entry = pte_mkyoung(orig_pte);
2010 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2011 if (ptep_set_access_flags(vma, address, page_table, entry, 1))
2012 update_mmu_cache(vma, address, page_table);
2013 pte_unmap_unlock(page_table, ptl);
2014
2015 if (dirty_shared) {
2016 struct address_space *mapping;
2017 int dirtied;
2018
2019 if (!page_mkwrite)
2020 lock_page(page);
2021
2022 dirtied = set_page_dirty(page);
2023 VM_BUG_ON_PAGE(PageAnon(page), page);
2024 mapping = page->mapping;
2025 unlock_page(page);
2026 page_cache_release(page);
2027
2028 if ((dirtied || page_mkwrite) && mapping) {
2029
2030
2031
2032
2033 balance_dirty_pages_ratelimited(mapping);
2034 }
2035
2036 if (!page_mkwrite)
2037 file_update_time(vma->vm_file);
2038 }
2039
2040 return VM_FAULT_WRITE;
2041}
2042
/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
2059static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2060 unsigned long address, pte_t *page_table, pmd_t *pmd,
2061 pte_t orig_pte, struct page *old_page)
2062{
2063 struct page *new_page = NULL;
2064 spinlock_t *ptl = NULL;
2065 pte_t entry;
2066 int page_copied = 0;
2067 const unsigned long mmun_start = address & PAGE_MASK;
2068 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2069 struct mem_cgroup *memcg;
2070
2071 if (unlikely(anon_vma_prepare(vma)))
2072 goto oom;
2073
2074 if (is_zero_pfn(pte_pfn(orig_pte))) {
2075 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2076 if (!new_page)
2077 goto oom;
2078 } else {
2079 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2080 if (!new_page)
2081 goto oom;
2082 cow_user_page(new_page, old_page, address, vma);
2083 }
2084
2085 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2086 goto oom_free_new;
2087
2088 __SetPageUptodate(new_page);
2089
2090 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2091
2092
2093
2094
2095 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2096 if (likely(pte_same(*page_table, orig_pte))) {
2097 if (old_page) {
2098 if (!PageAnon(old_page)) {
2099 dec_mm_counter_fast(mm, MM_FILEPAGES);
2100 inc_mm_counter_fast(mm, MM_ANONPAGES);
2101 }
2102 } else {
2103 inc_mm_counter_fast(mm, MM_ANONPAGES);
2104 }
2105 flush_cache_page(vma, address, pte_pfn(orig_pte));
2106 entry = mk_pte(new_page, vma->vm_page_prot);
2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2108
2109
2110
2111
2112
2113
2114 ptep_clear_flush_notify(vma, address, page_table);
2115 page_add_new_anon_rmap(new_page, vma, address);
2116 mem_cgroup_commit_charge(new_page, memcg, false);
2117 lru_cache_add_active_or_unevictable(new_page, vma);
2118
2119
2120
2121
2122
2123 set_pte_at_notify(mm, address, page_table, entry);
2124 update_mmu_cache(vma, address, page_table);
2125 if (old_page) {
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148 page_remove_rmap(old_page);
2149 }
2150
2151
2152 new_page = old_page;
2153 page_copied = 1;
2154 } else {
2155 mem_cgroup_cancel_charge(new_page, memcg);
2156 }
2157
2158 if (new_page)
2159 page_cache_release(new_page);
2160
2161 pte_unmap_unlock(page_table, ptl);
2162 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2163 if (old_page) {
2164
2165
2166
2167
2168 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2169 lock_page(old_page);
2170 munlock_vma_page(old_page);
2171 unlock_page(old_page);
2172 }
2173 page_cache_release(old_page);
2174 }
2175 return page_copied ? VM_FAULT_WRITE : 0;
2176oom_free_new:
2177 page_cache_release(new_page);
2178oom:
2179 if (old_page)
2180 page_cache_release(old_page);
2181 return VM_FAULT_OOM;
2182}
2183
2184
2185
2186
2187
2188static int wp_pfn_shared(struct mm_struct *mm,
2189 struct vm_area_struct *vma, unsigned long address,
2190 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2191 pmd_t *pmd)
2192{
2193 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2194 struct vm_fault vmf = {
2195 .page = NULL,
2196 .pgoff = linear_page_index(vma, address),
2197 .virtual_address = (void __user *)(address & PAGE_MASK),
2198 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2199 };
2200 int ret;
2201
2202 pte_unmap_unlock(page_table, ptl);
2203 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2204 if (ret & VM_FAULT_ERROR)
2205 return ret;
2206 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2207
2208
2209
2210
2211 if (!pte_same(*page_table, orig_pte)) {
2212 pte_unmap_unlock(page_table, ptl);
2213 return 0;
2214 }
2215 }
2216 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2217 NULL, 0, 0);
2218}
2219
2220static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2221 unsigned long address, pte_t *page_table,
2222 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
2223 struct page *old_page)
2224 __releases(ptl)
2225{
2226 int page_mkwrite = 0;
2227
2228 page_cache_get(old_page);
2229
2230
2231
2232
2233
2234
2235 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2236 int tmp;
2237
2238 pte_unmap_unlock(page_table, ptl);
2239 tmp = do_page_mkwrite(vma, old_page, address);
2240 if (unlikely(!tmp || (tmp &
2241 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2242 page_cache_release(old_page);
2243 return tmp;
2244 }
2245
2246
2247
2248
2249
2250
2251 page_table = pte_offset_map_lock(mm, pmd, address,
2252 &ptl);
2253 if (!pte_same(*page_table, orig_pte)) {
2254 unlock_page(old_page);
2255 pte_unmap_unlock(page_table, ptl);
2256 page_cache_release(old_page);
2257 return 0;
2258 }
2259 page_mkwrite = 1;
2260 }
2261
2262 return wp_page_reuse(mm, vma, address, page_table, ptl,
2263 orig_pte, old_page, page_mkwrite, 1);
2264}
2265
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2284static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2285 unsigned long address, pte_t *page_table, pmd_t *pmd,
2286 spinlock_t *ptl, pte_t orig_pte)
2287 __releases(ptl)
2288{
2289 struct page *old_page;
2290
2291 old_page = vm_normal_page(vma, address, orig_pte);
2292 if (!old_page) {
2293
2294
2295
2296
2297
2298
2299
2300 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2301 (VM_WRITE|VM_SHARED))
2302 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2303 orig_pte, pmd);
2304
2305 pte_unmap_unlock(page_table, ptl);
2306 return wp_page_copy(mm, vma, address, page_table, pmd,
2307 orig_pte, old_page);
2308 }
2309
2310
2311
2312
2313
2314 if (PageAnon(old_page) && !PageKsm(old_page)) {
2315 if (!trylock_page(old_page)) {
2316 page_cache_get(old_page);
2317 pte_unmap_unlock(page_table, ptl);
2318 lock_page(old_page);
2319 page_table = pte_offset_map_lock(mm, pmd, address,
2320 &ptl);
2321 if (!pte_same(*page_table, orig_pte)) {
2322 unlock_page(old_page);
2323 pte_unmap_unlock(page_table, ptl);
2324 page_cache_release(old_page);
2325 return 0;
2326 }
2327 page_cache_release(old_page);
2328 }
2329 if (reuse_swap_page(old_page)) {
2330
2331
2332
2333
2334
2335 page_move_anon_rmap(old_page, vma, address);
2336 unlock_page(old_page);
2337 return wp_page_reuse(mm, vma, address, page_table, ptl,
2338 orig_pte, old_page, 0, 0);
2339 }
2340 unlock_page(old_page);
2341 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2342 (VM_WRITE|VM_SHARED))) {
2343 return wp_page_shared(mm, vma, address, page_table, pmd,
2344 ptl, orig_pte, old_page);
2345 }
2346
2347
2348
2349
2350 page_cache_get(old_page);
2351
2352 pte_unmap_unlock(page_table, ptl);
2353 return wp_page_copy(mm, vma, address, page_table, pmd,
2354 orig_pte, old_page);
2355}
2356
2357static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2358 unsigned long start_addr, unsigned long end_addr,
2359 struct zap_details *details)
2360{
2361 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2362}
2363
2364static inline void unmap_mapping_range_tree(struct rb_root *root,
2365 struct zap_details *details)
2366{
2367 struct vm_area_struct *vma;
2368 pgoff_t vba, vea, zba, zea;
2369
2370 vma_interval_tree_foreach(vma, root,
2371 details->first_index, details->last_index) {
2372
2373 vba = vma->vm_pgoff;
2374 vea = vba + vma_pages(vma) - 1;
2375
2376 zba = details->first_index;
2377 if (zba < vba)
2378 zba = vba;
2379 zea = details->last_index;
2380 if (zea > vea)
2381 zea = vea;
2382
2383 unmap_mapping_range_vma(vma,
2384 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2385 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2386 details);
2387 }
2388}
2389
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
2407void unmap_mapping_range(struct address_space *mapping,
2408 loff_t const holebegin, loff_t const holelen, int even_cows)
2409{
2410 struct zap_details details;
2411 pgoff_t hba = holebegin >> PAGE_SHIFT;
2412 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2413
2414
2415 if (sizeof(holelen) > sizeof(hlen)) {
2416 long long holeend =
2417 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2418 if (holeend & ~(long long)ULONG_MAX)
2419 hlen = ULONG_MAX - hba + 1;
2420 }
2421
2422 details.check_mapping = even_cows? NULL: mapping;
2423 details.first_index = hba;
2424 details.last_index = hba + hlen - 1;
2425 if (details.last_index < details.first_index)
2426 details.last_index = ULONG_MAX;
2427
2428
2429
2430 i_mmap_lock_write(mapping);
2431 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2432 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2433 i_mmap_unlock_write(mapping);
2434}
2435EXPORT_SYMBOL(unmap_mapping_range);
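
/*
 * Example (illustrative sketch): truncation is the typical caller.  Roughly
 * what the truncate_pagecache() helper does when a file shrinks to "newsize":
 * unmap everything beyond the new EOF (a holelen of 0 means "to the end of
 * the file"), drop the page cache, then unmap again to catch pages faulted
 * back in racily.
 *
 *	unsigned long holebegin = round_up(newsize, PAGE_SIZE);
 *
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 *	truncate_inode_pages(mapping, newsize);
 *	unmap_mapping_range(mapping, holebegin, 0, 1);
 */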
2436
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
2445static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2446 unsigned long address, pte_t *page_table, pmd_t *pmd,
2447 unsigned int flags, pte_t orig_pte)
2448{
2449 spinlock_t *ptl;
2450 struct page *page, *swapcache;
2451 struct mem_cgroup *memcg;
2452 swp_entry_t entry;
2453 pte_t pte;
2454 int locked;
2455 int exclusive = 0;
2456 int ret = 0;
2457
2458 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2459 goto out;
2460
2461 entry = pte_to_swp_entry(orig_pte);
2462 if (unlikely(non_swap_entry(entry))) {
2463 if (is_migration_entry(entry)) {
2464 migration_entry_wait(mm, pmd, address);
2465 } else if (is_hwpoison_entry(entry)) {
2466 ret = VM_FAULT_HWPOISON;
2467 } else {
2468 print_bad_pte(vma, address, orig_pte, NULL);
2469 ret = VM_FAULT_SIGBUS;
2470 }
2471 goto out;
2472 }
2473 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2474 page = lookup_swap_cache(entry);
2475 if (!page) {
2476 page = swapin_readahead(entry,
2477 GFP_HIGHUSER_MOVABLE, vma, address);
2478 if (!page) {
2479
2480
2481
2482
2483 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2484 if (likely(pte_same(*page_table, orig_pte)))
2485 ret = VM_FAULT_OOM;
2486 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2487 goto unlock;
2488 }
2489
2490
2491 ret = VM_FAULT_MAJOR;
2492 count_vm_event(PGMAJFAULT);
2493 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2494 } else if (PageHWPoison(page)) {
2495
2496
2497
2498
2499 ret = VM_FAULT_HWPOISON;
2500 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2501 swapcache = page;
2502 goto out_release;
2503 }
2504
2505 swapcache = page;
2506 locked = lock_page_or_retry(page, mm, flags);
2507
2508 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2509 if (!locked) {
2510 ret |= VM_FAULT_RETRY;
2511 goto out_release;
2512 }
2513
2514
2515
2516
2517
2518
2519
2520 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2521 goto out_page;
2522
2523 page = ksm_might_need_to_copy(page, vma, address);
2524 if (unlikely(!page)) {
2525 ret = VM_FAULT_OOM;
2526 page = swapcache;
2527 goto out_page;
2528 }
2529
2530 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
2531 ret = VM_FAULT_OOM;
2532 goto out_page;
2533 }
2534
2535
2536
2537
2538 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2539 if (unlikely(!pte_same(*page_table, orig_pte)))
2540 goto out_nomap;
2541
2542 if (unlikely(!PageUptodate(page))) {
2543 ret = VM_FAULT_SIGBUS;
2544 goto out_nomap;
2545 }
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557 inc_mm_counter_fast(mm, MM_ANONPAGES);
2558 dec_mm_counter_fast(mm, MM_SWAPENTS);
2559 pte = mk_pte(page, vma->vm_page_prot);
2560 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2561 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2562 flags &= ~FAULT_FLAG_WRITE;
2563 ret |= VM_FAULT_WRITE;
2564 exclusive = 1;
2565 }
2566 flush_icache_page(vma, page);
2567 if (pte_swp_soft_dirty(orig_pte))
2568 pte = pte_mksoft_dirty(pte);
2569 set_pte_at(mm, address, page_table, pte);
2570 if (page == swapcache) {
2571 do_page_add_anon_rmap(page, vma, address, exclusive);
2572 mem_cgroup_commit_charge(page, memcg, true);
2573 } else {
2574 page_add_new_anon_rmap(page, vma, address);
2575 mem_cgroup_commit_charge(page, memcg, false);
2576 lru_cache_add_active_or_unevictable(page, vma);
2577 }
2578
2579 swap_free(entry);
2580 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2581 try_to_free_swap(page);
2582 unlock_page(page);
2583 if (page != swapcache) {
2584
2585
2586
2587
2588
2589
2590
2591
2592 unlock_page(swapcache);
2593 page_cache_release(swapcache);
2594 }
2595
2596 if (flags & FAULT_FLAG_WRITE) {
2597 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2598 if (ret & VM_FAULT_ERROR)
2599 ret &= VM_FAULT_ERROR;
2600 goto out;
2601 }
2602
2603
2604 update_mmu_cache(vma, address, page_table);
2605unlock:
2606 pte_unmap_unlock(page_table, ptl);
2607out:
2608 return ret;
2609out_nomap:
2610 mem_cgroup_cancel_charge(page, memcg);
2611 pte_unmap_unlock(page_table, ptl);
2612out_page:
2613 unlock_page(page);
2614out_release:
2615 page_cache_release(page);
2616 if (page != swapcache) {
2617 unlock_page(swapcache);
2618 page_cache_release(swapcache);
2619 }
2620 return ret;
2621}
2622
/*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */
2628static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2629{
2630 address &= PAGE_MASK;
2631 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2632 struct vm_area_struct *prev = vma->vm_prev;
2633
2634
2635
2636
2637
2638
2639
2640 if (prev && prev->vm_end == address)
2641 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2642
2643 return expand_downwards(vma, address - PAGE_SIZE);
2644 }
2645 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2646 struct vm_area_struct *next = vma->vm_next;
2647
2648
2649 if (next && next->vm_start == address + PAGE_SIZE)
2650 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2651
2652 return expand_upwards(vma, address + PAGE_SIZE);
2653 }
2654 return 0;
2655}
2656
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2662static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2663 unsigned long address, pte_t *page_table, pmd_t *pmd,
2664 unsigned int flags)
2665{
2666 struct mem_cgroup *memcg;
2667 struct page *page;
2668 spinlock_t *ptl;
2669 pte_t entry;
2670
2671 pte_unmap(page_table);
2672
2673
2674 if (vma->vm_flags & VM_SHARED)
2675 return VM_FAULT_SIGBUS;
2676
2677
2678 if (check_stack_guard_page(vma, address) < 0)
2679 return VM_FAULT_SIGSEGV;
2680
2681 /* Use the zero-page for reads */
2682 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
2683 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2684 vma->vm_page_prot));
2685 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2686 if (!pte_none(*page_table))
2687 goto unlock;
2688 goto setpte;
2689 }
2690
2691 /* Allocate our own private page. */
2692 if (unlikely(anon_vma_prepare(vma)))
2693 goto oom;
2694 page = alloc_zeroed_user_highpage_movable(vma, address);
2695 if (!page)
2696 goto oom;
2697
2698 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
2699 goto oom_free_page;
2700
2701 /*
2702 * The memory barrier inside __SetPageUptodate makes sure that
2703 * preceding stores to the page contents become visible before
2704 * the set_pte_at() write.
2705 */
2706 __SetPageUptodate(page);
2707
2708 entry = mk_pte(page, vma->vm_page_prot);
2709 if (vma->vm_flags & VM_WRITE)
2710 entry = pte_mkwrite(pte_mkdirty(entry));
2711
2712 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2713 if (!pte_none(*page_table))
2714 goto release;
2715
2716 inc_mm_counter_fast(mm, MM_ANONPAGES);
2717 page_add_new_anon_rmap(page, vma, address);
2718 mem_cgroup_commit_charge(page, memcg, false);
2719 lru_cache_add_active_or_unevictable(page, vma);
2720setpte:
2721 set_pte_at(mm, address, page_table, entry);
2722
2723 /* No need to invalidate - it was non-present before */
2724 update_mmu_cache(vma, address, page_table);
2725unlock:
2726 pte_unmap_unlock(page_table, ptl);
2727 return 0;
2728release:
2729 mem_cgroup_cancel_charge(page, memcg);
2730 page_cache_release(page);
2731 goto unlock;
2732oom_free_page:
2733 page_cache_release(page);
2734oom:
2735 return VM_FAULT_OOM;
2736}
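
/*
 * Illustrative sketch (not part of the original source): this is the path
 * that services faults on untouched private anonymous memory, e.g. from
 * userspace:
 *
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	char c = p[0];	// read fault: maps the shared zero page
 *	p[0] = 1;	// write fault: allocates a zeroed page
 *
 * The read is satisfied by the special zero-page pte set up above; the
 * later write then goes through do_wp_page() to replace it with a
 * private page.
 */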
2737
2738/*
2739 * The mmap_sem must have been held on entry, and may have been
2740 * released depending on flags and vma->vm_ops->fault() return value.
2741 * See filemap_fault() and __lock_page_or_retry().
2742 */
2743static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2744 pgoff_t pgoff, unsigned int flags,
2745 struct page *cow_page, struct page **page)
2746{
2747 struct vm_fault vmf;
2748 int ret;
2749
2750 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2751 vmf.pgoff = pgoff;
2752 vmf.flags = flags;
2753 vmf.page = NULL;
2754 vmf.cow_page = cow_page;
2755
2756 ret = vma->vm_ops->fault(vma, &vmf);
2757 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2758 return ret;
2759 if (!vmf.page)
2760 goto out;
2761
2762 if (unlikely(PageHWPoison(vmf.page))) {
2763 if (ret & VM_FAULT_LOCKED)
2764 unlock_page(vmf.page);
2765 page_cache_release(vmf.page);
2766 return VM_FAULT_HWPOISON;
2767 }
2768
2769 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2770 lock_page(vmf.page);
2771 else
2772 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
2773
2774 out:
2775 *page = vmf.page;
2776 return ret;
2777}
2778
2779/**
2780 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
2781 *
2782 * @vma: virtual memory area
2783 * @address: user virtual address
2784 * @page: page to map
2785 * @pte: pointer to target page table entry
2786 * @write: true, if new entry is writable
2787 * @anon: true, if it's anonymous page
2788 *
2789 * Caller must hold page table lock relevant for @pte.
2790 *
2791 * Target users are page handler itself and implementations of
2792 * vma->vm_ops->map_pages.
2793 */
2794void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2795 struct page *page, pte_t *pte, bool write, bool anon)
2796{
2797 pte_t entry;
2798
2799 flush_icache_page(vma, page);
2800 entry = mk_pte(page, vma->vm_page_prot);
2801 if (write)
2802 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2803 if (anon) {
2804 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2805 page_add_new_anon_rmap(page, vma, address);
2806 } else {
2807 inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
2808 page_add_file_rmap(page);
2809 }
2810 set_pte_at(vma->vm_mm, address, pte, entry);
2811
2812 /* no need to invalidate: a not-present page won't be cached */
2813 update_mmu_cache(vma, address, pte);
2814}
2815
2816static unsigned long fault_around_bytes __read_mostly =
2817 rounddown_pow_of_two(65536);
2818
2819#ifdef CONFIG_DEBUG_FS
2820static int fault_around_bytes_get(void *data, u64 *val)
2821{
2822 *val = fault_around_bytes;
2823 return 0;
2824}
2825
2826/*
2827 * fault_around_bytes must be rounded down to the nearest page order,
2828 * as that is what do_fault_around() expects to see.
2829 */
2831static int fault_around_bytes_set(void *data, u64 val)
2832{
2833 if (val / PAGE_SIZE > PTRS_PER_PTE)
2834 return -EINVAL;
2835 if (val > PAGE_SIZE)
2836 fault_around_bytes = rounddown_pow_of_two(val);
2837 else
2838 fault_around_bytes = PAGE_SIZE;
2839 return 0;
2840}
2841DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
2842 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
2843
2844static int __init fault_around_debugfs(void)
2845{
2846 void *ret;
2847
2848 ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
2849 &fault_around_bytes_fops);
2850 if (!ret)
2851 pr_warn("Failed to create fault_around_bytes in debugfs");
2852 return 0;
2853}
2854late_initcall(fault_around_debugfs);
2855#endif
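
/*
 * Illustrative usage (assuming debugfs is mounted at /sys/kernel/debug):
 *
 *	# cat /sys/kernel/debug/fault_around_bytes
 *	65536
 *	# echo 4096 > /sys/kernel/debug/fault_around_bytes
 *
 * Values of one page or less effectively disable fault-around (see the
 * "fault_around_bytes >> PAGE_SHIFT > 1" check in do_read_fault()), and
 * values larger than PTRS_PER_PTE pages are rejected with -EINVAL.
 */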
2856
2857/*
2858 * do_fault_around() tries to map few pages around the fault address. The
2859 * hope is that those pages will be needed soon and this will lower the
2860 * number of faults to handle.
2861 *
2862 * It uses vm_ops->map_pages() to map the pages, which skips any page that
2863 * is not ready to be mapped: not up-to-date, locked, etc.
2864 *
2865 * This function is called with the page table lock taken. In the split
2866 * ptlock case the lock only protects the entries of the page table that
2867 * corresponds to the fault address.
2868 *
2869 * This function doesn't cross VMA boundaries, in order to call
2870 * map_pages() only once.
2871 *
2872 * fault_around_bytes defines how much memory we'll try to map around the
2873 * fault; do_fault_around() expects it to be a power of two no larger than
2874 * PTRS_PER_PTE * PAGE_SIZE.
2875 *
2876 * The virtual address of the area that we map is naturally aligned to
2877 * fault_around_bytes (and therefore to page order), which makes it easier
2878 * to guarantee that we don't cross page table boundaries.
2879 */
2880static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2881 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2882{
2883 unsigned long start_addr, nr_pages, mask;
2884 pgoff_t max_pgoff;
2885 struct vm_fault vmf;
2886 int off;
2887
2888 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2889 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2890
2891 start_addr = max(address & mask, vma->vm_start);
2892 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2893 pte -= off;
2894 pgoff -= off;
2895
2896 /*
2897 * max_pgoff is either the end of the page table, the end of the
2898 * vma, or fault_around_bytes from pgoff, whichever is nearest.
2899 */
2900 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2901 PTRS_PER_PTE - 1;
2902 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
2903 pgoff + nr_pages - 1);
2904
2905 /* Check if it makes any sense to call ->map_pages */
2906 while (!pte_none(*pte)) {
2907 if (++pgoff > max_pgoff)
2908 return;
2909 start_addr += PAGE_SIZE;
2910 if (start_addr >= vma->vm_end)
2911 return;
2912 pte++;
2913 }
2914
2915 vmf.virtual_address = (void __user *) start_addr;
2916 vmf.pte = pte;
2917 vmf.pgoff = pgoff;
2918 vmf.max_pgoff = max_pgoff;
2919 vmf.flags = flags;
2920 vma->vm_ops->map_pages(vma, &vmf);
2921}
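
/*
 * Worked example (illustrative, assuming 4KiB pages and the default
 * fault_around_bytes of 65536): nr_pages is 16, mask clears the low 16
 * bits, so start_addr is the faulting address rounded down to a 64KiB
 * boundary (but never below vma->vm_start).  max_pgoff then caps the
 * range at whichever comes first: the end of this page table, the end
 * of the vma, or pgoff + 15.
 */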
2922
2923static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2924 unsigned long address, pmd_t *pmd,
2925 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2926{
2927 struct page *fault_page;
2928 spinlock_t *ptl;
2929 pte_t *pte;
2930 int ret = 0;
2931
2932 /*
2933 * Let's call ->map_pages() first and use ->fault() as fallback
2934 * if page by the offset is not ready to be mapped (cold page cache
2935 * or something).
2936 */
2937 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
2938 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2939 do_fault_around(vma, address, pte, pgoff, flags);
2940 if (!pte_same(*pte, orig_pte))
2941 goto unlock_out;
2942 pte_unmap_unlock(pte, ptl);
2943 }
2944
2945 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
2946 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2947 return ret;
2948
2949 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2950 if (unlikely(!pte_same(*pte, orig_pte))) {
2951 pte_unmap_unlock(pte, ptl);
2952 unlock_page(fault_page);
2953 page_cache_release(fault_page);
2954 return ret;
2955 }
2956 do_set_pte(vma, address, fault_page, pte, false, false);
2957 unlock_page(fault_page);
2958unlock_out:
2959 pte_unmap_unlock(pte, ptl);
2960 return ret;
2961}
2962
2963static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2964 unsigned long address, pmd_t *pmd,
2965 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2966{
2967 struct page *fault_page, *new_page;
2968 struct mem_cgroup *memcg;
2969 spinlock_t *ptl;
2970 pte_t *pte;
2971 int ret;
2972
2973 if (unlikely(anon_vma_prepare(vma)))
2974 return VM_FAULT_OOM;
2975
2976 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2977 if (!new_page)
2978 return VM_FAULT_OOM;
2979
2980 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
2981 page_cache_release(new_page);
2982 return VM_FAULT_OOM;
2983 }
2984
2985 ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
2986 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2987 goto uncharge_out;
2988
2989 if (fault_page)
2990 copy_user_highpage(new_page, fault_page, address, vma);
2991 __SetPageUptodate(new_page);
2992
2993 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2994 if (unlikely(!pte_same(*pte, orig_pte))) {
2995 pte_unmap_unlock(pte, ptl);
2996 if (fault_page) {
2997 unlock_page(fault_page);
2998 page_cache_release(fault_page);
2999 } else {
3000 /*
3001 * The fault handler has no page to lock, so it holds
3002 * i_mmap_lock_read(mapping) to protect against truncate.
3003 */
3004 i_mmap_unlock_read(vma->vm_file->f_mapping);
3005 }
3006 goto uncharge_out;
3007 }
3008 do_set_pte(vma, address, new_page, pte, true, true);
3009 mem_cgroup_commit_charge(new_page, memcg, false);
3010 lru_cache_add_active_or_unevictable(new_page, vma);
3011 pte_unmap_unlock(pte, ptl);
3012 if (fault_page) {
3013 unlock_page(fault_page);
3014 page_cache_release(fault_page);
3015 } else {
3016 /*
3017 * The fault handler has no page to lock, so it holds
3018 * i_mmap_lock_read(mapping) to protect against truncate.
3019 */
3020 i_mmap_unlock_read(vma->vm_file->f_mapping);
3021 }
3022 return ret;
3023uncharge_out:
3024 mem_cgroup_cancel_charge(new_page, memcg);
3025 page_cache_release(new_page);
3026 return ret;
3027}
3028
3029static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3030 unsigned long address, pmd_t *pmd,
3031 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3032{
3033 struct page *fault_page;
3034 struct address_space *mapping;
3035 spinlock_t *ptl;
3036 pte_t *pte;
3037 int dirtied = 0;
3038 int ret, tmp;
3039
3040 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
3041 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3042 return ret;
3043
3044 /*
3045 * Check if the backing address space wants to know that the page is
3046 * about to become writable
3047 */
3048 if (vma->vm_ops->page_mkwrite) {
3049 unlock_page(fault_page);
3050 tmp = do_page_mkwrite(vma, fault_page, address);
3051 if (unlikely(!tmp ||
3052 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3053 page_cache_release(fault_page);
3054 return tmp;
3055 }
3056 }
3057
3058 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
3059 if (unlikely(!pte_same(*pte, orig_pte))) {
3060 pte_unmap_unlock(pte, ptl);
3061 unlock_page(fault_page);
3062 page_cache_release(fault_page);
3063 return ret;
3064 }
3065 do_set_pte(vma, address, fault_page, pte, true, false);
3066 pte_unmap_unlock(pte, ptl);
3067
3068 if (set_page_dirty(fault_page))
3069 dirtied = 1;
3070 /*
3071 * Take a local copy of the address_space - page.mapping may be zeroed
3072 * by truncate after unlock_page().  The address_space itself remains
3073 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
3074 * release semantics to prevent the compiler from reordering it above.
3075 */
3076 mapping = fault_page->mapping;
3077 unlock_page(fault_page);
3078 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3079 /*
3080 * Some device drivers do not set page.mapping but still
3081 * dirty their pages
3082 */
3083 balance_dirty_pages_ratelimited(mapping);
3084 }
3085
3086 if (!vma->vm_ops->page_mkwrite)
3087 file_update_time(vma->vm_file);
3088
3089 return ret;
3090}
3091
3092/*
3093 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3094 * but allow concurrent faults).
3095 * The mmap_sem may have been released depending on flags and our
3096 * return value.  See filemap_fault() and __lock_page_or_retry().
3097 */
3098static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3099 unsigned long address, pte_t *page_table, pmd_t *pmd,
3100 unsigned int flags, pte_t orig_pte)
3101{
3102 pgoff_t pgoff = (((address & PAGE_MASK)
3103 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3104
3105 pte_unmap(page_table);
3106
3107 if (!vma->vm_ops->fault)
3108 return VM_FAULT_SIGBUS;
3109 if (!(flags & FAULT_FLAG_WRITE))
3110 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3111 orig_pte);
3112 if (!(vma->vm_flags & VM_SHARED))
3113 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3114 orig_pte);
3115 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3116}
3117
3118static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3119 unsigned long addr, int page_nid,
3120 int *flags)
3121{
3122 get_page(page);
3123
3124 count_vm_numa_event(NUMA_HINT_FAULTS);
3125 if (page_nid == numa_node_id()) {
3126 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3127 *flags |= TNF_FAULT_LOCAL;
3128 }
3129
3130 return mpol_misplaced(page, vma, addr);
3131}
3132
3133static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3134 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3135{
3136 struct page *page = NULL;
3137 spinlock_t *ptl;
3138 int page_nid = -1;
3139 int last_cpupid;
3140 int target_nid;
3141 bool migrated = false;
3142 bool was_writable = pte_write(pte);
3143 int flags = 0;
3144
3145 /* A PROT_NONE fault should not end up here */
3146 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
3147
3148 /*
3149 * The "pte" at this point cannot be used safely without
3150 * validation through pte_unmap_same(). It's of NUMA type but
3151 * the pfn may be screwed if the read is non atomic.
3152 *
3153 * We can safely just do a "set_pte_at()", because the old
3154 * page table entry is not accessible, so there would be no
3155 * concurrent hardware modifications to the PTE.
3156 */
3157 ptl = pte_lockptr(mm, pmd);
3158 spin_lock(ptl);
3159 if (unlikely(!pte_same(*ptep, pte))) {
3160 pte_unmap_unlock(ptep, ptl);
3161 goto out;
3162 }
3163
3164 /* Make it present again */
3165 pte = pte_modify(pte, vma->vm_page_prot);
3166 pte = pte_mkyoung(pte);
3167 if (was_writable)
3168 pte = pte_mkwrite(pte);
3169 set_pte_at(mm, addr, ptep, pte);
3170 update_mmu_cache(vma, addr, ptep);
3171
3172 page = vm_normal_page(vma, addr, pte);
3173 if (!page) {
3174 pte_unmap_unlock(ptep, ptl);
3175 return 0;
3176 }
3177
3178 /*
3179 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3180 * much anyway since they can be in shared cache state. This misses
3181 * the case where a mapping is writable but the process never writes
3182 * to it but pte_write gets cleared during protection updates and
3183 * pte_dirty has unpredictable behaviour between PTE scan updates,
3184 * background writeback, dirty balancing and application behaviour.
3185 */
3186 if (!(vma->vm_flags & VM_WRITE))
3187 flags |= TNF_NO_GROUP;
3188
3189 /*
3190 * Flag if the page is shared between multiple address spaces. This
3191 * is later used when determining whether to group tasks together
3192 */
3193 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3194 flags |= TNF_SHARED;
3195
3196 last_cpupid = page_cpupid_last(page);
3197 page_nid = page_to_nid(page);
3198 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3199 pte_unmap_unlock(ptep, ptl);
3200 if (target_nid == -1) {
3201 put_page(page);
3202 goto out;
3203 }
3204
3205 /* Migrate to the requested node */
3206 migrated = migrate_misplaced_page(page, vma, target_nid);
3207 if (migrated) {
3208 page_nid = target_nid;
3209 flags |= TNF_MIGRATED;
3210 } else
3211 flags |= TNF_MIGRATE_FAIL;
3212
3213out:
3214 if (page_nid != -1)
3215 task_numa_fault(last_cpupid, page_nid, 1, flags);
3216 return 0;
3217}
3218
3219/*
3220 * These routines also need to handle stuff like marking pages dirty
3221 * and/or accessed for architectures that don't do it in hardware (most
3222 * RISC architectures).  The early dirtying is also good on the i386.
3223 *
3224 * There is also a hook called "update_mmu_cache()" that architectures
3225 * with external mmu caches can use to update those (ie the Sparc or
3226 * PowerPC hashed page tables that act as extended TLBs).
3227 *
3228 * We enter with non-exclusive mmap_sem (to exclude vma changes, but
3229 * allow concurrent faults), and pte mapped but not yet locked.
3230 * We return with pte unmapped and unlocked.
3231 *
3232 * The mmap_sem may have been released depending on flags and our
3233 * return value.  See filemap_fault() and __lock_page_or_retry().
3234 */
3235static int handle_pte_fault(struct mm_struct *mm,
3236 struct vm_area_struct *vma, unsigned long address,
3237 pte_t *pte, pmd_t *pmd, unsigned int flags)
3238{
3239 pte_t entry;
3240 spinlock_t *ptl;
3241
3242 /*
3243 * Some architectures can have larger ptes than wordsize,
3244 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
3245 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
3246 * The code below just needs a consistent view for the ifs and
3247 * we later double check anyway with the ptl lock held. So here
3248 * a barrier will do.
3249 */
3250 entry = *pte;
3251 barrier();
3252 if (!pte_present(entry)) {
3253 if (pte_none(entry)) {
3254 if (vma->vm_ops)
3255 return do_fault(mm, vma, address, pte, pmd,
3256 flags, entry);
3257
3258 return do_anonymous_page(mm, vma, address, pte, pmd,
3259 flags);
3260 }
3261 return do_swap_page(mm, vma, address,
3262 pte, pmd, flags, entry);
3263 }
3264
3265 if (pte_protnone(entry))
3266 return do_numa_page(mm, vma, address, entry, pte, pmd);
3267
3268 ptl = pte_lockptr(mm, pmd);
3269 spin_lock(ptl);
3270 if (unlikely(!pte_same(*pte, entry)))
3271 goto unlock;
3272 if (flags & FAULT_FLAG_WRITE) {
3273 if (!pte_write(entry))
3274 return do_wp_page(mm, vma, address,
3275 pte, pmd, ptl, entry);
3276 entry = pte_mkdirty(entry);
3277 }
3278 entry = pte_mkyoung(entry);
3279 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3280 update_mmu_cache(vma, address, pte);
3281 } else {
3282 /*
3283 * This is needed only for protection faults but the arch code
3284 * is not yet telling us if this is a protection fault or not.
3285 * This still avoids useless tlb flushes for .text page faults
3286 * with threads.
3287 */
3288 if (flags & FAULT_FLAG_WRITE)
3289 flush_tlb_fix_spurious_fault(vma, address);
3290 }
3291unlock:
3292 pte_unmap_unlock(pte, ptl);
3293 return 0;
3294}
3295
3296/*
3297 * By the time we get here, we already hold the mm semaphore.
3298 *
3299 * The mmap_sem may have been released depending on flags and our
3300 * return value.  See filemap_fault() and __lock_page_or_retry().
3301 */
3302static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3303 unsigned long address, unsigned int flags)
3304{
3305 pgd_t *pgd;
3306 pud_t *pud;
3307 pmd_t *pmd;
3308 pte_t *pte;
3309
3310 if (unlikely(is_vm_hugetlb_page(vma)))
3311 return hugetlb_fault(mm, vma, address, flags);
3312
3313 pgd = pgd_offset(mm, address);
3314 pud = pud_alloc(mm, pgd, address);
3315 if (!pud)
3316 return VM_FAULT_OOM;
3317 pmd = pmd_alloc(mm, pud, address);
3318 if (!pmd)
3319 return VM_FAULT_OOM;
3320 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3321 int ret = VM_FAULT_FALLBACK;
3322 if (!vma->vm_ops)
3323 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3324 pmd, flags);
3325 if (!(ret & VM_FAULT_FALLBACK))
3326 return ret;
3327 } else {
3328 pmd_t orig_pmd = *pmd;
3329 int ret;
3330
3331 barrier();
3332 if (pmd_trans_huge(orig_pmd)) {
3333 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3334
3335 /*
3336 * If the pmd is splitting, return and retry the
3337 * fault.  Alternative: wait until the split
3338 * is done, and goto retry.
3339 */
3340 if (pmd_trans_splitting(orig_pmd))
3341 return 0;
3342
3343 if (pmd_protnone(orig_pmd))
3344 return do_huge_pmd_numa_page(mm, vma, address,
3345 orig_pmd, pmd);
3346
3347 if (dirty && !pmd_write(orig_pmd)) {
3348 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3349 orig_pmd);
3350 if (!(ret & VM_FAULT_FALLBACK))
3351 return ret;
3352 } else {
3353 huge_pmd_set_accessed(mm, vma, address, pmd,
3354 orig_pmd, dirty);
3355 return 0;
3356 }
3357 }
3358 }
3359
3360 /*
3361 * Use __pte_alloc instead of pte_alloc_map, because we can't
3362 * run pte_offset_map on the pmd, if a huge pmd could
3363 * materialize from under us from a different thread.
3364 */
3365 if (unlikely(pmd_none(*pmd)) &&
3366 unlikely(__pte_alloc(mm, vma, pmd, address)))
3367 return VM_FAULT_OOM;
3368 /* if a huge pmd materialized from under us just retry later */
3369 if (unlikely(pmd_trans_huge(*pmd)))
3370 return 0;
3371 /*
3372 * A regular pmd is established and it can't morph into a huge pmd
3373 * from under us anymore at this point because we hold the mmap_sem
3374 * read mode and khugepaged takes it in write mode. So now it's
3375 * safe to run pte_offset_map().
3376 */
3377 pte = pte_offset_map(pmd, address);
3378
3379 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3380}
3381
3382/*
3383 * By the time we get here, we already hold the mm semaphore.
3384 *
3385 * The mmap_sem may have been released depending on flags and our
3386 * return value.  See filemap_fault() and __lock_page_or_retry().
3387 */
3388int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3389 unsigned long address, unsigned int flags)
3390{
3391 int ret;
3392
3393 __set_current_state(TASK_RUNNING);
3394
3395 count_vm_event(PGFAULT);
3396 mem_cgroup_count_vm_event(mm, PGFAULT);
3397
3398 /* do counter updates before entering really critical section. */
3399 check_sync_rss_stat(current);
3400
3401 /*
3402 * Enable the memcg OOM handling for faults triggered in user
3403 * space.  Kernel faults are handled more gracefully.
3404 */
3405 if (flags & FAULT_FLAG_USER)
3406 mem_cgroup_oom_enable();
3407
3408 ret = __handle_mm_fault(mm, vma, address, flags);
3409
3410 if (flags & FAULT_FLAG_USER) {
3411 mem_cgroup_oom_disable();
3412 /*
3413 * The task may have entered a memcg OOM situation but
3414 * if the allocation error was handled gracefully (no
3415 * VM_FAULT_OOM), there is no need to kill anything.
3416 * Just clean up the OOM state peacefully.
3417 */
3418 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3419 mem_cgroup_oom_synchronize(false);
3420 }
3421
3422 return ret;
3423}
3424EXPORT_SYMBOL_GPL(handle_mm_fault);
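
/*
 * Illustrative sketch (not part of this file) of how an architecture's
 * page fault handler typically reaches handle_mm_fault(); retry and
 * error handling are heavily simplified:
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, address);
 *	if (vma && vma->vm_start <= address) {
 *		unsigned int flags = FAULT_FLAG_KILLABLE;
 *
 *		if (user_mode(regs))
 *			flags |= FAULT_FLAG_USER;
 *		if (is_write)
 *			flags |= FAULT_FLAG_WRITE;
 *		fault = handle_mm_fault(mm, vma, address, flags);
 *		// then check VM_FAULT_ERROR, VM_FAULT_RETRY, VM_FAULT_MAJOR
 *	}
 *	up_read(&mm->mmap_sem);
 */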
3425
3426#ifndef __PAGETABLE_PUD_FOLDED
3427/*
3428 * Allocate page upper directory.
3429 * We've already handled the fast-path in-line.
3430 */
3431int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3432{
3433 pud_t *new = pud_alloc_one(mm, address);
3434 if (!new)
3435 return -ENOMEM;
3436
3437 smp_wmb(); /* See comment in __pte_alloc */
3438
3439 spin_lock(&mm->page_table_lock);
3440 if (pgd_present(*pgd))
3441 pud_free(mm, new);
3442 else
3443 pgd_populate(mm, pgd, new);
3444 spin_unlock(&mm->page_table_lock);
3445 return 0;
3446}
3447#endif
3448
3449#ifndef __PAGETABLE_PMD_FOLDED
3450/*
3451 * Allocate page middle directory.
3452 * We've already handled the fast-path in-line.
3453 */
3454int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3455{
3456 pmd_t *new = pmd_alloc_one(mm, address);
3457 if (!new)
3458 return -ENOMEM;
3459
3460 smp_wmb(); /* See comment in __pte_alloc */
3461
3462 spin_lock(&mm->page_table_lock);
3463#ifndef __ARCH_HAS_4LEVEL_HACK
3464 if (!pud_present(*pud)) {
3465 mm_inc_nr_pmds(mm);
3466 pud_populate(mm, pud, new);
3467 } else
3468 pmd_free(mm, new);
3469#else
3470 if (!pgd_present(*pud)) {
3471 mm_inc_nr_pmds(mm);
3472 pgd_populate(mm, pud, new);
3473 } else
3474 pmd_free(mm, new);
3475#endif
3476 spin_unlock(&mm->page_table_lock);
3477 return 0;
3478}
3479#endif
3480
3481static int __follow_pte(struct mm_struct *mm, unsigned long address,
3482 pte_t **ptepp, spinlock_t **ptlp)
3483{
3484 pgd_t *pgd;
3485 pud_t *pud;
3486 pmd_t *pmd;
3487 pte_t *ptep;
3488
3489 pgd = pgd_offset(mm, address);
3490 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3491 goto out;
3492
3493 pud = pud_offset(pgd, address);
3494 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3495 goto out;
3496
3497 pmd = pmd_offset(pud, address);
3498 VM_BUG_ON(pmd_trans_huge(*pmd));
3499 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3500 goto out;
3501
3502 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3503 if (pmd_huge(*pmd))
3504 goto out;
3505
3506 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3507 if (!ptep)
3508 goto out;
3509 if (!pte_present(*ptep))
3510 goto unlock;
3511 *ptepp = ptep;
3512 return 0;
3513unlock:
3514 pte_unmap_unlock(ptep, *ptlp);
3515out:
3516 return -EINVAL;
3517}
3518
3519static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3520 pte_t **ptepp, spinlock_t **ptlp)
3521{
3522 int res;
3523
3524 /* (void) is needed to make gcc happy */
3525 (void) __cond_lock(*ptlp,
3526 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3527 return res;
3528}
3529
3530/**
3531 * follow_pfn - look up PFN at a user virtual address
3532 * @vma: memory mapping
3533 * @address: user virtual address
3534 * @pfn: location to store found PFN
3535 *
3536 * Only IO mappings and raw PFN mappings are allowed.
3537 *
3538 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3539 */
3540int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3541 unsigned long *pfn)
3542{
3543 int ret = -EINVAL;
3544 spinlock_t *ptl;
3545 pte_t *ptep;
3546
3547 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3548 return ret;
3549
3550 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3551 if (ret)
3552 return ret;
3553 *pfn = pte_pfn(*ptep);
3554 pte_unmap_unlock(ptep, ptl);
3555 return 0;
3556}
3557EXPORT_SYMBOL(follow_pfn);
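
/*
 * Illustrative caller sketch (not part of this file): a driver handed a
 * user address inside a VM_PFNMAP mapping might do
 *
 *	down_read(&current->mm->mmap_sem);
 *	vma = find_vma(current->mm, uaddr);
 *	if (vma && (vma->vm_flags & VM_PFNMAP) &&
 *	    !follow_pfn(vma, uaddr, &pfn))
 *		paddr = (pfn << PAGE_SHIFT) | (uaddr & ~PAGE_MASK);
 *	up_read(&current->mm->mmap_sem);
 *
 * The pte can still change after the lookup, so callers need their own
 * guarantee that the mapping stays stable while they use the result.
 */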
3558
3559#ifdef CONFIG_HAVE_IOREMAP_PROT
3560int follow_phys(struct vm_area_struct *vma,
3561 unsigned long address, unsigned int flags,
3562 unsigned long *prot, resource_size_t *phys)
3563{
3564 int ret = -EINVAL;
3565 pte_t *ptep, pte;
3566 spinlock_t *ptl;
3567
3568 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3569 goto out;
3570
3571 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3572 goto out;
3573 pte = *ptep;
3574
3575 if ((flags & FOLL_WRITE) && !pte_write(pte))
3576 goto unlock;
3577
3578 *prot = pgprot_val(pte_pgprot(pte));
3579 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3580
3581 ret = 0;
3582unlock:
3583 pte_unmap_unlock(ptep, ptl);
3584out:
3585 return ret;
3586}
3587
3588int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3589 void *buf, int len, int write)
3590{
3591 resource_size_t phys_addr;
3592 unsigned long prot = 0;
3593 void __iomem *maddr;
3594 int offset = addr & (PAGE_SIZE-1);
3595
3596 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3597 return -EINVAL;
3598
3599 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
3600 if (write)
3601 memcpy_toio(maddr + offset, buf, len);
3602 else
3603 memcpy_fromio(buf, maddr + offset, len);
3604 iounmap(maddr);
3605
3606 return len;
3607}
3608EXPORT_SYMBOL_GPL(generic_access_phys);
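
/*
 * Illustrative note (not part of this file): a driver that maps MMIO or
 * raw physical memory can plug generic_access_phys() in as its
 * vm_ops->access callback, so that ptrace and /proc/<pid>/mem can reach
 * the mapping through __access_remote_vm() below, e.g.
 *
 *	static const struct vm_operations_struct foo_mmap_ops = {
 *		.access = generic_access_phys,
 *	};
 *
 * (foo_mmap_ops is a made-up name for the sketch.)
 */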
3609#endif
3610
3611/*
3612 * Access another process' address space as given in mm.  If non-NULL,
3613 * use the given task for page fault accounting.
3614 */
3615static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3616 unsigned long addr, void *buf, int len, int write)
3617{
3618 struct vm_area_struct *vma;
3619 void *old_buf = buf;
3620
3621 down_read(&mm->mmap_sem);
3622
3623 while (len) {
3624 int bytes, ret, offset;
3625 void *maddr;
3626 struct page *page = NULL;
3627
3628 ret = get_user_pages(tsk, mm, addr, 1,
3629 write, 1, &page, &vma);
3630 if (ret <= 0) {
3631#ifndef CONFIG_HAVE_IOREMAP_PROT
3632 break;
3633#else
3634 /*
3635 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3636 * we can access using slightly different code.
3637 */
3638 vma = find_vma(mm, addr);
3639 if (!vma || vma->vm_start > addr)
3640 break;
3641 if (vma->vm_ops && vma->vm_ops->access)
3642 ret = vma->vm_ops->access(vma, addr, buf,
3643 len, write);
3644 if (ret <= 0)
3645 break;
3646 bytes = ret;
3647#endif
3648 } else {
3649 bytes = len;
3650 offset = addr & (PAGE_SIZE-1);
3651 if (bytes > PAGE_SIZE-offset)
3652 bytes = PAGE_SIZE-offset;
3653
3654 maddr = kmap(page);
3655 if (write) {
3656 copy_to_user_page(vma, page, addr,
3657 maddr + offset, buf, bytes);
3658 set_page_dirty_lock(page);
3659 } else {
3660 copy_from_user_page(vma, page, addr,
3661 buf, maddr + offset, bytes);
3662 }
3663 kunmap(page);
3664 page_cache_release(page);
3665 }
3666 len -= bytes;
3667 buf += bytes;
3668 addr += bytes;
3669 }
3670 up_read(&mm->mmap_sem);
3671
3672 return buf - old_buf;
3673}
3674
3675/**
3676 * access_remote_vm - access another process' address space
3677 * @mm:		the mm_struct of the target address space
3678 * @addr:	start address to access
3679 * @buf:	source or destination buffer
3680 * @len:	number of bytes to transfer
3681 * @write:	whether the access is a write
3682 *
3683 * The caller must hold a reference on @mm.
3684 */
3685int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3686 void *buf, int len, int write)
3687{
3688 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3689}
3690
3691/*
3692 * Access another process' address space.
3693 * Source/target buffer must be kernel space.
3694 * Do not walk the page table directly; use get_user_pages.
3695 */
3696int access_process_vm(struct task_struct *tsk, unsigned long addr,
3697 void *buf, int len, int write)
3698{
3699 struct mm_struct *mm;
3700 int ret;
3701
3702 mm = get_task_mm(tsk);
3703 if (!mm)
3704 return 0;
3705
3706 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3707 mmput(mm);
3708
3709 return ret;
3710}
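
/*
 * Illustrative note (not part of this file): this helper is what backs
 * ptrace word transfers (and, via access_remote_vm() above,
 * /proc/<pid>/mem).  A kernel-side read of a traced child could look
 * like
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word), 0)
 *			!= sizeof(word))
 *		return -EIO;
 *
 * with a non-zero last argument used for writes instead.
 */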
3711
3712/*
3713 * Print the name of a VMA.
3714 */
3715void print_vma_addr(char *prefix, unsigned long ip)
3716{
3717 struct mm_struct *mm = current->mm;
3718 struct vm_area_struct *vma;
3719
3720 /*
3721 * Do not print if we are in atomic
3722 * contexts (in exception stacks, etc.):
3723 */
3724 if (preempt_count())
3725 return;
3726
3727 down_read(&mm->mmap_sem);
3728 vma = find_vma(mm, ip);
3729 if (vma && vma->vm_file) {
3730 struct file *f = vma->vm_file;
3731 char *buf = (char *)__get_free_page(GFP_KERNEL);
3732 if (buf) {
3733 char *p;
3734
3735 p = file_path(f, buf, PAGE_SIZE);
3736 if (IS_ERR(p))
3737 p = "?";
3738 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3739 vma->vm_start,
3740 vma->vm_end - vma->vm_start);
3741 free_page((unsigned long)buf);
3742 }
3743 }
3744 up_read(&mm->mmap_sem);
3745}
3746
3747#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3748void __might_fault(const char *file, int line)
3749{
3750 /*
3751 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3752 * holding the mmap_sem, this is safe because kernel memory doesn't
3753 * get paged out, therefore we'll never actually fault, and the
3754 * below annotations will generate false positives.
3755 */
3756 if (segment_eq(get_fs(), KERNEL_DS))
3757 return;
3758 if (pagefault_disabled())
3759 return;
3760 __might_sleep(file, line, 0);
3761#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
3762 if (current->mm)
3763 might_lock_read(&current->mm->mmap_sem);
3764#endif
3765}
3766EXPORT_SYMBOL(__might_fault);
3767#endif
3768
3769#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3770static void clear_gigantic_page(struct page *page,
3771 unsigned long addr,
3772 unsigned int pages_per_huge_page)
3773{
3774 int i;
3775 struct page *p = page;
3776
3777 might_sleep();
3778 for (i = 0; i < pages_per_huge_page;
3779 i++, p = mem_map_next(p, page, i)) {
3780 cond_resched();
3781 clear_user_highpage(p, addr + i * PAGE_SIZE);
3782 }
3783}
3784void clear_huge_page(struct page *page,
3785 unsigned long addr, unsigned int pages_per_huge_page)
3786{
3787 int i;
3788
3789 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3790 clear_gigantic_page(page, addr, pages_per_huge_page);
3791 return;
3792 }
3793
3794 might_sleep();
3795 for (i = 0; i < pages_per_huge_page; i++) {
3796 cond_resched();
3797 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3798 }
3799}
3800
3801static void copy_user_gigantic_page(struct page *dst, struct page *src,
3802 unsigned long addr,
3803 struct vm_area_struct *vma,
3804 unsigned int pages_per_huge_page)
3805{
3806 int i;
3807 struct page *dst_base = dst;
3808 struct page *src_base = src;
3809
3810 for (i = 0; i < pages_per_huge_page; ) {
3811 cond_resched();
3812 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3813
3814 i++;
3815 dst = mem_map_next(dst, dst_base, i);
3816 src = mem_map_next(src, src_base, i);
3817 }
3818}
3819
3820void copy_user_huge_page(struct page *dst, struct page *src,
3821 unsigned long addr, struct vm_area_struct *vma,
3822 unsigned int pages_per_huge_page)
3823{
3824 int i;
3825
3826 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3827 copy_user_gigantic_page(dst, src, addr, vma,
3828 pages_per_huge_page);
3829 return;
3830 }
3831
3832 might_sleep();
3833 for (i = 0; i < pages_per_huge_page; i++) {
3834 cond_resched();
3835 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3836 }
3837}
3838#endif
3839
3840#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
3841/* The per-page-table spinlock does not fit in struct page here, so it is allocated from a dedicated slab. */
3842static struct kmem_cache *page_ptl_cachep;
3843
3844void __init ptlock_cache_init(void)
3845{
3846 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
3847 SLAB_PANIC, NULL);
3848}
3849
3850bool ptlock_alloc(struct page *page)
3851{
3852 spinlock_t *ptl;
3853
3854 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
3855 if (!ptl)
3856 return false;
3857 page->ptl = ptl;
3858 return true;
3859}
3860
3861void ptlock_free(struct page *page)
3862{
3863 kmem_cache_free(page_ptl_cachep, page->ptl);
3864}
3865#endif
3866