/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/module.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59#include <linux/gfp.h>
60
61#include <asm/io.h>
62#include <asm/pgalloc.h>
63#include <asm/uaccess.h>
64#include <asm/tlb.h>
65#include <asm/tlbflush.h>
66#include <asm/pgtable.h>
67
68#include "internal.h"
69
70#ifndef CONFIG_NEED_MULTIPLE_NODES
71
72unsigned long max_mapnr;
73struct page *mem_map;
74
75EXPORT_SYMBOL(max_mapnr);
76EXPORT_SYMBOL(mem_map);
77#endif
78
79unsigned long num_physpages;
80
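/*
 * high_memory is the virtual address of the top of the kernel's directly
 * mapped memory (the boundary between lowmem and highmem).
 */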
87void * high_memory;
88
89EXPORT_SYMBOL(num_physpages);
90EXPORT_SYMBOL(high_memory);
91
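/*
 * Randomize the address space (stacks, mmaps, brk, etc).
 *
 * With CONFIG_COMPAT_BRK the default is 1 (leave the heap start
 * unrandomized, for old binaries that assume a fixed brk); otherwise
 * it is 2 (full randomization).  Booting with "norandmaps" disables
 * randomization entirely.
 */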
98int randomize_va_space __read_mostly =
99#ifdef CONFIG_COMPAT_BRK
100 1;
101#else
102 2;
103#endif
104
105static int __init disable_randmaps(char *s)
106{
107 randomize_va_space = 0;
108 return 1;
109}
110__setup("norandmaps", disable_randmaps);
111
112unsigned long zero_pfn __read_mostly;
113unsigned long highest_memmap_pfn __read_mostly;
114
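/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init();
 * record its pfn here so that is_zero_pfn()/my_zero_pfn() can test for it.
 */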
118static int __init init_zero_pfn(void)
119{
120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0;
122}
123core_initcall(init_zero_pfn);
124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153
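/* sync the task's rss counters into the mm once every TASK_RSS_EVENTS_THRESH events */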
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167
168
169
170
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172
173
174
175
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184}
185#else
186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189
190static void check_sync_rss_stat(struct task_struct *task)
191{
192}
193
194#endif
195
196
197
198
199
200
201
202void pgd_clear_bad(pgd_t *pgd)
203{
204 pgd_ERROR(*pgd);
205 pgd_clear(pgd);
206}
207
208void pud_clear_bad(pud_t *pud)
209{
210 pud_ERROR(*pud);
211 pud_clear(pud);
212}
213
214void pmd_clear_bad(pmd_t *pmd)
215{
216 pmd_ERROR(*pmd);
217 pmd_clear(pmd);
218}
219
220
221
222
223
224static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
225 unsigned long addr)
226{
227 pgtable_t token = pmd_pgtable(*pmd);
228 pmd_clear(pmd);
229 pte_free_tlb(tlb, token, addr);
230 tlb->mm->nr_ptes--;
231}
232
233static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
234 unsigned long addr, unsigned long end,
235 unsigned long floor, unsigned long ceiling)
236{
237 pmd_t *pmd;
238 unsigned long next;
239 unsigned long start;
240
241 start = addr;
242 pmd = pmd_offset(pud, addr);
243 do {
244 next = pmd_addr_end(addr, end);
245 if (pmd_none_or_clear_bad(pmd))
246 continue;
247 free_pte_range(tlb, pmd, addr);
248 } while (pmd++, addr = next, addr != end);
249
250 start &= PUD_MASK;
251 if (start < floor)
252 return;
253 if (ceiling) {
254 ceiling &= PUD_MASK;
255 if (!ceiling)
256 return;
257 }
258 if (end - 1 > ceiling - 1)
259 return;
260
261 pmd = pmd_offset(pud, start);
262 pud_clear(pud);
263 pmd_free_tlb(tlb, pmd, start);
264}
265
266static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
267 unsigned long addr, unsigned long end,
268 unsigned long floor, unsigned long ceiling)
269{
270 pud_t *pud;
271 unsigned long next;
272 unsigned long start;
273
274 start = addr;
275 pud = pud_offset(pgd, addr);
276 do {
277 next = pud_addr_end(addr, end);
278 if (pud_none_or_clear_bad(pud))
279 continue;
280 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
281 } while (pud++, addr = next, addr != end);
282
283 start &= PGDIR_MASK;
284 if (start < floor)
285 return;
286 if (ceiling) {
287 ceiling &= PGDIR_MASK;
288 if (!ceiling)
289 return;
290 }
291 if (end - 1 > ceiling - 1)
292 return;
293
294 pud = pud_offset(pgd, start);
295 pgd_clear(pgd);
296 pud_free_tlb(tlb, pud, start);
297}
298
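/*
 * Free a range of user-level page tables: the pte pages themselves and
 * any pmd/pud levels left empty, constrained by the floor and ceiling
 * of the neighbouring vmas so that shared upper-level tables survive.
 */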
304void free_pgd_range(struct mmu_gather *tlb,
305 unsigned long addr, unsigned long end,
306 unsigned long floor, unsigned long ceiling)
307{
308 pgd_t *pgd;
309 unsigned long next;
310
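	/*
	 * Trim addr and end so that only whole page-table pages we own
	 * get freed: round addr down to a PMD boundary and skip it if
	 * that dips below floor, and likewise clip end back from a
	 * partially covered PMD at the ceiling.
	 */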
337 addr &= PMD_MASK;
338 if (addr < floor) {
339 addr += PMD_SIZE;
340 if (!addr)
341 return;
342 }
343 if (ceiling) {
344 ceiling &= PMD_MASK;
345 if (!ceiling)
346 return;
347 }
348 if (end - 1 > ceiling - 1)
349 end -= PMD_SIZE;
350 if (addr > end - 1)
351 return;
352
353 pgd = pgd_offset(tlb->mm, addr);
354 do {
355 next = pgd_addr_end(addr, end);
356 if (pgd_none_or_clear_bad(pgd))
357 continue;
358 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
359 } while (pgd++, addr = next, addr != end);
360}
361
362void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
363 unsigned long floor, unsigned long ceiling)
364{
365 while (vma) {
366 struct vm_area_struct *next = vma->vm_next;
367 unsigned long addr = vma->vm_start;
368
369
370
371
372
373 unlink_anon_vmas(vma);
374 unlink_file_vma(vma);
375
376 if (is_vm_hugetlb_page(vma)) {
377 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
378 floor, next? next->vm_start: ceiling);
379 } else {
380
381
382
383 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
384 && !is_vm_hugetlb_page(next)) {
385 vma = next;
386 next = vma->vm_next;
387 unlink_anon_vmas(vma);
388 unlink_file_vma(vma);
389 }
390 free_pgd_range(tlb, addr, vma->vm_end,
391 floor, next? next->vm_start: ceiling);
392 }
393 vma = next;
394 }
395}
396
397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
399{
400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
402 if (!new)
403 return -ENOMEM;
404
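	/*
	 * Make sure the new pte page is fully initialised and visible to
	 * other CPUs before it is linked into the page tables below:
	 * lockless page table walkers rely on seeing initialised pages.
	 */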
418 smp_wmb();
419
420 spin_lock(&mm->page_table_lock);
421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) {
423 mm->nr_ptes++;
424 pmd_populate(mm, pmd, new);
425 new = NULL;
426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
428 spin_unlock(&mm->page_table_lock);
429 if (new)
430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
433 return 0;
434}
435
436int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
437{
438 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
439 if (!new)
440 return -ENOMEM;
441
442 smp_wmb();
443
444 spin_lock(&init_mm.page_table_lock);
445 if (likely(pmd_none(*pmd))) {
446 pmd_populate_kernel(&init_mm, pmd, new);
447 new = NULL;
448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
450 spin_unlock(&init_mm.page_table_lock);
451 if (new)
452 pte_free_kernel(&init_mm, new);
453 return 0;
454}
455
456static inline void init_rss_vec(int *rss)
457{
458 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
459}
460
461static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
462{
463 int i;
464
465 if (current->mm == mm)
466 sync_mm_rss(current, mm);
467 for (i = 0; i < NR_MM_COUNTERS; i++)
468 if (rss[i])
469 add_mm_counter(mm, i, rss[i]);
470}
471
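/*
 * Report a pte that should never have been seen: dump the pte, the page
 * (if any) and the vma, then taint the kernel.  The caller must still
 * handle the error itself.
 */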
479static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
480 pte_t pte, struct page *page)
481{
482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
483 pud_t *pud = pud_offset(pgd, addr);
484 pmd_t *pmd = pmd_offset(pud, addr);
485 struct address_space *mapping;
486 pgoff_t index;
487 static unsigned long resume;
488 static unsigned long nr_shown;
489 static unsigned long nr_unshown;
490
491
492
493
494
495 if (nr_shown == 60) {
496 if (time_before(jiffies, resume)) {
497 nr_unshown++;
498 return;
499 }
500 if (nr_unshown) {
501 printk(KERN_ALERT
502 "BUG: Bad page map: %lu messages suppressed\n",
503 nr_unshown);
504 nr_unshown = 0;
505 }
506 nr_shown = 0;
507 }
508 if (nr_shown++ == 0)
509 resume = jiffies + 60 * HZ;
510
511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
512 index = linear_page_index(vma, addr);
513
514 printk(KERN_ALERT
515 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
516 current->comm,
517 (long long)pte_val(pte), (long long)pmd_val(*pmd));
518 if (page)
519 dump_page(page);
520 printk(KERN_ALERT
521 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
523
524
525
526 if (vma->vm_ops)
527 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
528 (unsigned long)vma->vm_ops->fault);
529 if (vma->vm_file && vma->vm_file->f_op)
530 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
531 (unsigned long)vma->vm_file->f_op->mmap);
532 dump_stack();
533 add_taint(TAINT_BAD_PAGE);
534}
535
536static inline int is_cow_mapping(unsigned int flags)
537{
538 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
539}
540
541#ifndef is_zero_pfn
542static inline int is_zero_pfn(unsigned long pfn)
543{
544 return pfn == zero_pfn;
545}
546#endif
547
548#ifndef my_zero_pfn
549static inline unsigned long my_zero_pfn(unsigned long addr)
550{
551 return zero_pfn;
552}
553#endif
554
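/*
 * vm_normal_page -- return the struct page behind a pte, or NULL for
 * "special" mappings that have no refcounted page: the shared zero page,
 * raw PFN mappings, and VM_MIXEDMAP entries with no valid memmap.
 *
 * When the architecture provides pte_special() (HAVE_PTE_SPECIAL) the
 * pte itself identifies special mappings.  Otherwise they are inferred
 * from the vma flags: VM_MIXEDMAP, and VM_PFNMAP vmas whose vm_pgoff
 * encodes the linear pfn mapping set up by remap_pfn_range() (COWed
 * pages inside such a mapping are still normal).
 */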
597#ifdef __HAVE_ARCH_PTE_SPECIAL
598# define HAVE_PTE_SPECIAL 1
599#else
600# define HAVE_PTE_SPECIAL 0
601#endif
602struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
603 pte_t pte)
604{
605 unsigned long pfn = pte_pfn(pte);
606
607 if (HAVE_PTE_SPECIAL) {
608 if (likely(!pte_special(pte)))
609 goto check_pfn;
610 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
611 return NULL;
612 if (!is_zero_pfn(pfn))
613 print_bad_pte(vma, addr, pte, NULL);
614 return NULL;
615 }
616
617
618
619 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
620 if (vma->vm_flags & VM_MIXEDMAP) {
621 if (!pfn_valid(pfn))
622 return NULL;
623 goto out;
624 } else {
625 unsigned long off;
626 off = (addr - vma->vm_start) >> PAGE_SHIFT;
627 if (pfn == vma->vm_pgoff + off)
628 return NULL;
629 if (!is_cow_mapping(vma->vm_flags))
630 return NULL;
631 }
632 }
633
634 if (is_zero_pfn(pfn))
635 return NULL;
636check_pfn:
637 if (unlikely(pfn > highest_memmap_pfn)) {
638 print_bad_pte(vma, addr, pte, NULL);
639 return NULL;
640 }
641
642
643
644
645
646out:
647 return pfn_to_page(pfn);
648}
649
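/*
 * Copy one pte from the parent's page table into the child's.  Swap and
 * migration entries only need their reference counts adjusted; present
 * pages in COW mappings are write-protected in both mms and have their
 * rmap and rss accounting duplicated.  Returns a swap entry value when
 * swap_duplicate() requires a count continuation, otherwise 0.
 */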
656static inline unsigned long
657copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
658 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
659 unsigned long addr, int *rss)
660{
661 unsigned long vm_flags = vma->vm_flags;
662 pte_t pte = *src_pte;
663 struct page *page;
664
665
666 if (unlikely(!pte_present(pte))) {
667 if (!pte_file(pte)) {
668 swp_entry_t entry = pte_to_swp_entry(pte);
669
670 if (swap_duplicate(entry) < 0)
671 return entry.val;
672
673
674 if (unlikely(list_empty(&dst_mm->mmlist))) {
675 spin_lock(&mmlist_lock);
676 if (list_empty(&dst_mm->mmlist))
677 list_add(&dst_mm->mmlist,
678 &src_mm->mmlist);
679 spin_unlock(&mmlist_lock);
680 }
681 if (likely(!non_swap_entry(entry)))
682 rss[MM_SWAPENTS]++;
683 else if (is_write_migration_entry(entry) &&
684 is_cow_mapping(vm_flags)) {
685
686
687
688
689 make_migration_entry_read(&entry);
690 pte = swp_entry_to_pte(entry);
691 set_pte_at(src_mm, addr, src_pte, pte);
692 }
693 }
694 goto out_set_pte;
695 }
696
697
698
699
700
701 if (is_cow_mapping(vm_flags)) {
702 ptep_set_wrprotect(src_mm, addr, src_pte);
703 pte = pte_wrprotect(pte);
704 }
705
706
707
708
709
710 if (vm_flags & VM_SHARED)
711 pte = pte_mkclean(pte);
712 pte = pte_mkold(pte);
713
714 page = vm_normal_page(vma, addr, pte);
715 if (page) {
716 get_page(page);
717 page_dup_rmap(page);
718 if (PageAnon(page))
719 rss[MM_ANONPAGES]++;
720 else
721 rss[MM_FILEPAGES]++;
722 }
723
724out_set_pte:
725 set_pte_at(dst_mm, addr, dst_pte, pte);
726 return 0;
727}
728
729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
731 unsigned long addr, unsigned long end)
732{
733 pte_t *orig_src_pte, *orig_dst_pte;
734 pte_t *src_pte, *dst_pte;
735 spinlock_t *src_ptl, *dst_ptl;
736 int progress = 0;
737 int rss[NR_MM_COUNTERS];
738 swp_entry_t entry = (swp_entry_t){0};
739
740again:
741 init_rss_vec(rss);
742
743 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
744 if (!dst_pte)
745 return -ENOMEM;
746 src_pte = pte_offset_map(src_pmd, addr);
747 src_ptl = pte_lockptr(src_mm, src_pmd);
748 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
749 orig_src_pte = src_pte;
750 orig_dst_pte = dst_pte;
751 arch_enter_lazy_mmu_mode();
752
753 do {
754
755
756
757
758 if (progress >= 32) {
759 progress = 0;
760 if (need_resched() ||
761 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
762 break;
763 }
764 if (pte_none(*src_pte)) {
765 progress++;
766 continue;
767 }
768 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
769 vma, addr, rss);
770 if (entry.val)
771 break;
772 progress += 8;
773 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
774
775 arch_leave_lazy_mmu_mode();
776 spin_unlock(src_ptl);
777 pte_unmap(orig_src_pte);
778 add_mm_rss_vec(dst_mm, rss);
779 pte_unmap_unlock(orig_dst_pte, dst_ptl);
780 cond_resched();
781
782 if (entry.val) {
783 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
784 return -ENOMEM;
785 progress = 0;
786 }
787 if (addr != end)
788 goto again;
789 return 0;
790}
791
792static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
793 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
794 unsigned long addr, unsigned long end)
795{
796 pmd_t *src_pmd, *dst_pmd;
797 unsigned long next;
798
799 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
800 if (!dst_pmd)
801 return -ENOMEM;
802 src_pmd = pmd_offset(src_pud, addr);
803 do {
804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814
815 }
816 if (pmd_none_or_clear_bad(src_pmd))
817 continue;
818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
819 vma, addr, next))
820 return -ENOMEM;
821 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
822 return 0;
823}
824
825static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
826 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
827 unsigned long addr, unsigned long end)
828{
829 pud_t *src_pud, *dst_pud;
830 unsigned long next;
831
832 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
833 if (!dst_pud)
834 return -ENOMEM;
835 src_pud = pud_offset(src_pgd, addr);
836 do {
837 next = pud_addr_end(addr, end);
838 if (pud_none_or_clear_bad(src_pud))
839 continue;
840 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
841 vma, addr, next))
842 return -ENOMEM;
843 } while (dst_pud++, src_pud++, addr = next, addr != end);
844 return 0;
845}
846
847int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
848 struct vm_area_struct *vma)
849{
850 pgd_t *src_pgd, *dst_pgd;
851 unsigned long next;
852 unsigned long addr = vma->vm_start;
853 unsigned long end = vma->vm_end;
854 int ret;
855
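	/*
	 * Don't copy ptes where a page fault will fill them correctly:
	 * fork becomes much lighter when large read-only or shared
	 * mappings are populated lazily.  Hugetlb, nonlinear, PFN and
	 * explicitly inserted mappings still have to be copied eagerly.
	 */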
862 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
863 if (!vma->anon_vma)
864 return 0;
865 }
866
867 if (is_vm_hugetlb_page(vma))
868 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
869
870 if (unlikely(is_pfn_mapping(vma))) {
871
872
873
874
875 ret = track_pfn_vma_copy(vma);
876 if (ret)
877 return ret;
878 }
879
880
881
882
883
884
885
886 if (is_cow_mapping(vma->vm_flags))
887 mmu_notifier_invalidate_range_start(src_mm, addr, end);
888
889 ret = 0;
890 dst_pgd = pgd_offset(dst_mm, addr);
891 src_pgd = pgd_offset(src_mm, addr);
892 do {
893 next = pgd_addr_end(addr, end);
894 if (pgd_none_or_clear_bad(src_pgd))
895 continue;
896 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
897 vma, addr, next))) {
898 ret = -ENOMEM;
899 break;
900 }
901 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
902
903 if (is_cow_mapping(vma->vm_flags))
904 mmu_notifier_invalidate_range_end(src_mm,
905 vma->vm_start, end);
906 return ret;
907}
908
909static unsigned long zap_pte_range(struct mmu_gather *tlb,
910 struct vm_area_struct *vma, pmd_t *pmd,
911 unsigned long addr, unsigned long end,
912 long *zap_work, struct zap_details *details)
913{
914 struct mm_struct *mm = tlb->mm;
915 pte_t *pte;
916 spinlock_t *ptl;
917 int rss[NR_MM_COUNTERS];
918
919 init_rss_vec(rss);
920
921 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922 arch_enter_lazy_mmu_mode();
923 do {
924 pte_t ptent = *pte;
925 if (pte_none(ptent)) {
926 (*zap_work)--;
927 continue;
928 }
929
930 (*zap_work) -= PAGE_SIZE;
931
932 if (pte_present(ptent)) {
933 struct page *page;
934
935 page = vm_normal_page(vma, addr, ptent);
936 if (unlikely(details) && page) {
937
938
939
940
941
942 if (details->check_mapping &&
943 details->check_mapping != page->mapping)
944 continue;
945
946
947
948
949 if (details->nonlinear_vma &&
950 (page->index < details->first_index ||
951 page->index > details->last_index))
952 continue;
953 }
954 ptent = ptep_get_and_clear_full(mm, addr, pte,
955 tlb->fullmm);
956 tlb_remove_tlb_entry(tlb, pte, addr);
957 if (unlikely(!page))
958 continue;
959 if (unlikely(details) && details->nonlinear_vma
960 && linear_page_index(details->nonlinear_vma,
961 addr) != page->index)
962 set_pte_at(mm, addr, pte,
963 pgoff_to_pte(page->index));
964 if (PageAnon(page))
965 rss[MM_ANONPAGES]--;
966 else {
967 if (pte_dirty(ptent))
968 set_page_dirty(page);
969 if (pte_young(ptent) &&
970 likely(!VM_SequentialReadHint(vma)))
971 mark_page_accessed(page);
972 rss[MM_FILEPAGES]--;
973 }
974 page_remove_rmap(page);
975 if (unlikely(page_mapcount(page) < 0))
976 print_bad_pte(vma, addr, ptent, page);
977 tlb_remove_page(tlb, page);
978 continue;
979 }
980
981
982
983
984 if (unlikely(details))
985 continue;
986 if (pte_file(ptent)) {
987 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
988 print_bad_pte(vma, addr, ptent, NULL);
989 } else {
990 swp_entry_t entry = pte_to_swp_entry(ptent);
991
992 if (!non_swap_entry(entry))
993 rss[MM_SWAPENTS]--;
994 if (unlikely(!free_swap_and_cache(entry)))
995 print_bad_pte(vma, addr, ptent, NULL);
996 }
997 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
998 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
999
1000 add_mm_rss_vec(mm, rss);
1001 arch_leave_lazy_mmu_mode();
1002 pte_unmap_unlock(pte - 1, ptl);
1003
1004 return addr;
1005}
1006
1007static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1008 struct vm_area_struct *vma, pud_t *pud,
1009 unsigned long addr, unsigned long end,
1010 long *zap_work, struct zap_details *details)
1011{
1012 pmd_t *pmd;
1013 unsigned long next;
1014
1015 pmd = pmd_offset(pud, addr);
1016 do {
1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026
1027 }
1028 if (pmd_none_or_clear_bad(pmd)) {
1029 (*zap_work)--;
1030 continue;
1031 }
1032 next = zap_pte_range(tlb, vma, pmd, addr, next,
1033 zap_work, details);
1034 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
1035
1036 return addr;
1037}
1038
1039static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1040 struct vm_area_struct *vma, pgd_t *pgd,
1041 unsigned long addr, unsigned long end,
1042 long *zap_work, struct zap_details *details)
1043{
1044 pud_t *pud;
1045 unsigned long next;
1046
1047 pud = pud_offset(pgd, addr);
1048 do {
1049 next = pud_addr_end(addr, end);
1050 if (pud_none_or_clear_bad(pud)) {
1051 (*zap_work)--;
1052 continue;
1053 }
1054 next = zap_pmd_range(tlb, vma, pud, addr, next,
1055 zap_work, details);
1056 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1057
1058 return addr;
1059}
1060
1061static unsigned long unmap_page_range(struct mmu_gather *tlb,
1062 struct vm_area_struct *vma,
1063 unsigned long addr, unsigned long end,
1064 long *zap_work, struct zap_details *details)
1065{
1066 pgd_t *pgd;
1067 unsigned long next;
1068
1069 if (details && !details->check_mapping && !details->nonlinear_vma)
1070 details = NULL;
1071
1072 BUG_ON(addr >= end);
1073 mem_cgroup_uncharge_start();
1074 tlb_start_vma(tlb, vma);
1075 pgd = pgd_offset(vma->vm_mm, addr);
1076 do {
1077 next = pgd_addr_end(addr, end);
1078 if (pgd_none_or_clear_bad(pgd)) {
1079 (*zap_work)--;
1080 continue;
1081 }
1082 next = zap_pud_range(tlb, vma, pgd, addr, next,
1083 zap_work, details);
1084 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1085 tlb_end_vma(tlb, vma);
1086 mem_cgroup_uncharge_end();
1087
1088 return addr;
1089}
1090
1091#ifdef CONFIG_PREEMPT
1092# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1093#else
1094
1095# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1096#endif
1097
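/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather pointer
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: place number of unmapped pages in vm-accountable vmas here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Pages are zapped in ZAP_BLOCK_SIZE chunks so that locks are not held
 * for too long; the mmu_gather is finished and restarted around each
 * reschedule point, which is why it is passed in by reference.
 */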
1124unsigned long unmap_vmas(struct mmu_gather **tlbp,
1125 struct vm_area_struct *vma, unsigned long start_addr,
1126 unsigned long end_addr, unsigned long *nr_accounted,
1127 struct zap_details *details)
1128{
1129 long zap_work = ZAP_BLOCK_SIZE;
1130 unsigned long tlb_start = 0;
1131 int tlb_start_valid = 0;
1132 unsigned long start = start_addr;
1133 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134 int fullmm = (*tlbp)->fullmm;
1135 struct mm_struct *mm = vma->vm_mm;
1136
1137 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1138 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
1139 unsigned long end;
1140
1141 start = max(vma->vm_start, start_addr);
1142 if (start >= vma->vm_end)
1143 continue;
1144 end = min(vma->vm_end, end_addr);
1145 if (end <= vma->vm_start)
1146 continue;
1147
1148 if (vma->vm_flags & VM_ACCOUNT)
1149 *nr_accounted += (end - start) >> PAGE_SHIFT;
1150
1151 if (unlikely(is_pfn_mapping(vma)))
1152 untrack_pfn_vma(vma, 0, 0);
1153
1154 while (start != end) {
1155 if (!tlb_start_valid) {
1156 tlb_start = start;
1157 tlb_start_valid = 1;
1158 }
1159
1160 if (unlikely(is_vm_hugetlb_page(vma))) {
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172 if (vma->vm_file) {
1173 unmap_hugepage_range(vma, start, end, NULL);
1174 zap_work -= (end - start) /
1175 pages_per_huge_page(hstate_vma(vma));
1176 }
1177
1178 start = end;
1179 } else
1180 start = unmap_page_range(*tlbp, vma,
1181 start, end, &zap_work, details);
1182
1183 if (zap_work > 0) {
1184 BUG_ON(start != end);
1185 break;
1186 }
1187
1188 tlb_finish_mmu(*tlbp, tlb_start, start);
1189
1190 if (need_resched() ||
1191 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192 if (i_mmap_lock) {
1193 *tlbp = NULL;
1194 goto out;
1195 }
1196 cond_resched();
1197 }
1198
1199 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200 tlb_start_valid = 0;
1201 zap_work = ZAP_BLOCK_SIZE;
1202 }
1203 }
1204out:
1205 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1206 return start;
1207}
1208
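/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */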
1216unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217 unsigned long size, struct zap_details *details)
1218{
1219 struct mm_struct *mm = vma->vm_mm;
1220 struct mmu_gather *tlb;
1221 unsigned long end = address + size;
1222 unsigned long nr_accounted = 0;
1223
1224 lru_add_drain();
1225 tlb = tlb_gather_mmu(mm, 0);
1226 update_hiwater_rss(mm);
1227 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228 if (tlb)
1229 tlb_finish_mmu(tlb, address, end);
1230 return end;
1231}
1232
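/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Only unmaps ptes in VM_PFNMAP vmas, and the range must lie entirely
 * within the vma.  Returns 0 on success, -1 otherwise.
 */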
1245int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1246 unsigned long size)
1247{
1248 if (address < vma->vm_start || address + size > vma->vm_end ||
1249 !(vma->vm_flags & VM_PFNMAP))
1250 return -1;
1251 zap_page_range(vma, address, size, NULL);
1252 return 0;
1253}
1254EXPORT_SYMBOL_GPL(zap_vma_ptes);
1255
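/**
 * follow_page - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: FOLL_ flags modifying lookup behaviour
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */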
1268struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1269 unsigned int flags)
1270{
1271 pgd_t *pgd;
1272 pud_t *pud;
1273 pmd_t *pmd;
1274 pte_t *ptep, pte;
1275 spinlock_t *ptl;
1276 struct page *page;
1277 struct mm_struct *mm = vma->vm_mm;
1278
1279 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1280 if (!IS_ERR(page)) {
1281 BUG_ON(flags & FOLL_GET);
1282 goto out;
1283 }
1284
1285 page = NULL;
1286 pgd = pgd_offset(mm, address);
1287 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1288 goto no_page_table;
1289
1290 pud = pud_offset(pgd, address);
1291 if (pud_none(*pud))
1292 goto no_page_table;
1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1294 BUG_ON(flags & FOLL_GET);
1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1296 goto out;
1297 }
1298 if (unlikely(pud_bad(*pud)))
1299 goto no_page_table;
1300
1301 pmd = pmd_offset(pud, address);
1302 if (pmd_none(*pmd))
1303 goto no_page_table;
1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1305 BUG_ON(flags & FOLL_GET);
1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1307 goto out;
1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327
1328 }
1329split_fallthrough:
1330 if (unlikely(pmd_bad(*pmd)))
1331 goto no_page_table;
1332
1333 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1334
1335 pte = *ptep;
1336 if (!pte_present(pte))
1337 goto no_page;
1338 if ((flags & FOLL_WRITE) && !pte_write(pte))
1339 goto unlock;
1340
1341 page = vm_normal_page(vma, address, pte);
1342 if (unlikely(!page)) {
1343 if ((flags & FOLL_DUMP) ||
1344 !is_zero_pfn(pte_pfn(pte)))
1345 goto bad_page;
1346 page = pte_page(pte);
1347 }
1348
1349 if (flags & FOLL_GET)
1350 get_page(page);
1351 if (flags & FOLL_TOUCH) {
1352 if ((flags & FOLL_WRITE) &&
1353 !pte_dirty(pte) && !PageDirty(page))
1354 set_page_dirty(page);
1355
1356
1357
1358
1359
1360 mark_page_accessed(page);
1361 }
1362 if (flags & FOLL_MLOCK) {
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain();
1374
1375
1376
1377
1378
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1384unlock:
1385 pte_unmap_unlock(ptep, ptl);
1386out:
1387 return page;
1388
1389bad_page:
1390 pte_unmap_unlock(ptep, ptl);
1391 return ERR_PTR(-EFAULT);
1392
1393no_page:
1394 pte_unmap_unlock(ptep, ptl);
1395 if (!pte_none(pte))
1396 return page;
1397
1398no_page_table:
1399
1400
1401
1402
1403
1404
1405
1406
1407 if ((flags & FOLL_DUMP) &&
1408 (!vma->vm_ops || !vma->vm_ops->fault))
1409 return ERR_PTR(-EFAULT);
1410 return page;
1411}
1412
1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1417{
1418 int i;
1419 unsigned long vm_flags;
1420
1421 if (nr_pages <= 0)
1422 return 0;
1423
1424 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1425
1426
1427
1428
1429
1430 vm_flags = (gup_flags & FOLL_WRITE) ?
1431 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1432 vm_flags &= (gup_flags & FOLL_FORCE) ?
1433 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1434 i = 0;
1435
1436 do {
1437 struct vm_area_struct *vma;
1438
1439 vma = find_extend_vma(mm, start);
1440 if (!vma && in_gate_area(tsk, start)) {
1441 unsigned long pg = start & PAGE_MASK;
1442 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1443 pgd_t *pgd;
1444 pud_t *pud;
1445 pmd_t *pmd;
1446 pte_t *pte;
1447
1448
1449 if (gup_flags & FOLL_WRITE)
1450 return i ? : -EFAULT;
1451 if (pg > TASK_SIZE)
1452 pgd = pgd_offset_k(pg);
1453 else
1454 pgd = pgd_offset_gate(mm, pg);
1455 BUG_ON(pgd_none(*pgd));
1456 pud = pud_offset(pgd, pg);
1457 BUG_ON(pud_none(*pud));
1458 pmd = pmd_offset(pud, pg);
1459 if (pmd_none(*pmd))
1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1462 pte = pte_offset_map(pmd, pg);
1463 if (pte_none(*pte)) {
1464 pte_unmap(pte);
1465 return i ? : -EFAULT;
1466 }
1467 if (pages) {
1468 struct page *page;
1469
1470 page = vm_normal_page(gate_vma, start, *pte);
1471 if (!page) {
1472 if (!(gup_flags & FOLL_DUMP) &&
1473 is_zero_pfn(pte_pfn(*pte)))
1474 page = pte_page(*pte);
1475 else {
1476 pte_unmap(pte);
1477 return i ? : -EFAULT;
1478 }
1479 }
1480 pages[i] = page;
1481 get_page(page);
1482 }
1483 pte_unmap(pte);
1484 if (vmas)
1485 vmas[i] = gate_vma;
1486 i++;
1487 start += PAGE_SIZE;
1488 nr_pages--;
1489 continue;
1490 }
1491
1492 if (!vma ||
1493 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1494 !(vm_flags & vma->vm_flags))
1495 return i ? : -EFAULT;
1496
1497 if (is_vm_hugetlb_page(vma)) {
1498 i = follow_hugetlb_page(mm, vma, pages, vmas,
1499 &start, &nr_pages, i, gup_flags);
1500 continue;
1501 }
1502
1503 do {
1504 struct page *page;
1505 unsigned int foll_flags = gup_flags;
1506
1507
1508
1509
1510
1511 if (unlikely(fatal_signal_pending(current)))
1512 return i ? i : -ERESTARTSYS;
1513
1514 cond_resched();
1515 while (!(page = follow_page(vma, start, foll_flags))) {
1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1523
1524 ret = handle_mm_fault(mm, vma, start,
1525 fault_flags);
1526
1527 if (ret & VM_FAULT_ERROR) {
1528 if (ret & VM_FAULT_OOM)
1529 return i ? i : -ENOMEM;
1530 if (ret &
1531 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1532 VM_FAULT_SIGBUS))
1533 return i ? i : -EFAULT;
1534 BUG();
1535 }
1536 if (ret & VM_FAULT_MAJOR)
1537 tsk->maj_flt++;
1538 else
1539 tsk->min_flt++;
1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558 if ((ret & VM_FAULT_WRITE) &&
1559 !(vma->vm_flags & VM_WRITE))
1560 foll_flags &= ~FOLL_WRITE;
1561
1562 cond_resched();
1563 }
1564 if (IS_ERR(page))
1565 return i ? i : PTR_ERR(page);
1566 if (pages) {
1567 pages[i] = page;
1568
1569 flush_anon_page(vma, page, start);
1570 flush_dcache_page(page);
1571 }
1572 if (vmas)
1573 vmas[i] = vma;
1574 i++;
1575 start += PAGE_SIZE;
1576 nr_pages--;
1577 } while (nr_pages && start < vma->vm_end);
1578 } while (nr_pages);
1579 return i;
1580}
1581
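/**
 * get_user_pages() - pin user pages in memory
 * @tsk: task_struct of target task
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @write: whether pages will be written to by the caller
 * @force: whether to force access even if the user mapping is currently
 *         protected against it (pages may be COWed as a result)
 * @pages: array that receives pointers to the pinned pages, at least
 *         nr_pages long, or NULL if the caller only wants the pages
 *         faulted in
 * @vmas: array of pointers to the vmas corresponding to each page, or
 *        NULL if the caller does not need them
 *
 * Returns the number of pages pinned, which may be fewer than requested,
 * or a negative errno if no pages were pinned.  Each returned page must
 * eventually be released with put_page().
 *
 * Must be called with mmap_sem held.
 */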
1632int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1633 unsigned long start, int nr_pages, int write, int force,
1634 struct page **pages, struct vm_area_struct **vmas)
1635{
1636 int flags = FOLL_TOUCH;
1637
1638 if (pages)
1639 flags |= FOLL_GET;
1640 if (write)
1641 flags |= FOLL_WRITE;
1642 if (force)
1643 flags |= FOLL_FORCE;
1644
1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1647}
1648EXPORT_SYMBOL(get_user_pages);
1649
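/*
 * get_dump_page() - pin a user page in memory while writing it to a core
 * dump.  Returns the pinned page (to be released with put_page() by the
 * caller) or NULL on any failure, in which case a hole should be written
 * to the core file instead.  Called without mmap_sem, after all other
 * threads have been killed.
 */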
1664#ifdef CONFIG_ELF_CORE
1665struct page *get_dump_page(unsigned long addr)
1666{
1667 struct vm_area_struct *vma;
1668 struct page *page;
1669
1670 if (__get_user_pages(current, current->mm, addr, 1,
1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1673 return NULL;
1674 flush_cache_page(vma, addr, page_to_pfn(page));
1675 return page;
1676}
1677#endif
1678
1679pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1680 spinlock_t **ptl)
1681{
1682 pgd_t * pgd = pgd_offset(mm, addr);
1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1684 if (pud) {
1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1690 }
1691 return NULL;
1692}
1693
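/*
 * The old fallback for page remapping: map a refcounted kernel page into
 * a user vma by hand.  Anonymous pages are rejected; the page is
 * accounted as a file page and added to the file rmap.
 */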
1701static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1702 struct page *page, pgprot_t prot)
1703{
1704 struct mm_struct *mm = vma->vm_mm;
1705 int retval;
1706 pte_t *pte;
1707 spinlock_t *ptl;
1708
1709 retval = -EINVAL;
1710 if (PageAnon(page))
1711 goto out;
1712 retval = -ENOMEM;
1713 flush_dcache_page(page);
1714 pte = get_locked_pte(mm, addr, &ptl);
1715 if (!pte)
1716 goto out;
1717 retval = -EBUSY;
1718 if (!pte_none(*pte))
1719 goto out_unlock;
1720
1721
1722 get_page(page);
1723 inc_mm_counter_fast(mm, MM_FILEPAGES);
1724 page_add_file_rmap(page);
1725 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1726
1727 retval = 0;
1728 pte_unmap_unlock(pte, ptl);
1729 return retval;
1730out_unlock:
1731 pte_unmap_unlock(pte, ptl);
1732out:
1733 return retval;
1734}
1735
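/**
 * vm_insert_page - insert a single page into a user vma
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * Allows drivers to insert individual pages they have allocated into a
 * user vma.  The page must be a clean, individual kernel allocation;
 * typically this is called from an f_op->mmap() handler, which should
 * also set up the vma so that ptes can be zapped and refilled at any time.
 */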
1758int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1759 struct page *page)
1760{
1761 if (addr < vma->vm_start || addr >= vma->vm_end)
1762 return -EFAULT;
1763 if (!page_count(page))
1764 return -EINVAL;
1765 vma->vm_flags |= VM_INSERTPAGE;
1766 return insert_page(vma, addr, page, vma->vm_page_prot);
1767}
1768EXPORT_SYMBOL(vm_insert_page);
1769
1770static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1771 unsigned long pfn, pgprot_t prot)
1772{
1773 struct mm_struct *mm = vma->vm_mm;
1774 int retval;
1775 pte_t *pte, entry;
1776 spinlock_t *ptl;
1777
1778 retval = -ENOMEM;
1779 pte = get_locked_pte(mm, addr, &ptl);
1780 if (!pte)
1781 goto out;
1782 retval = -EBUSY;
1783 if (!pte_none(*pte))
1784 goto out_unlock;
1785
1786
1787 entry = pte_mkspecial(pfn_pte(pfn, prot));
1788 set_pte_at(mm, addr, pte, entry);
1789 update_mmu_cache(vma, addr, pte);
1790
1791 retval = 0;
1792out_unlock:
1793 pte_unmap_unlock(pte, ptl);
1794out:
1795 return retval;
1796}
1797
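/**
 * vm_insert_pfn - insert a single pfn into a user vma
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Like vm_insert_page(), but for raw pfns with no struct page.  The vma
 * must be marked VM_PFNMAP or VM_MIXEDMAP (not both); a VM_PFNMAP vma
 * must not be a COW mapping, and a VM_MIXEDMAP vma must not be handed a
 * pfn that has a valid struct page.
 */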
1815int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1816 unsigned long pfn)
1817{
1818 int ret;
1819 pgprot_t pgprot = vma->vm_page_prot;
1820
1821
1822
1823
1824
1825
1826 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1827 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1828 (VM_PFNMAP|VM_MIXEDMAP));
1829 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1830 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1831
1832 if (addr < vma->vm_start || addr >= vma->vm_end)
1833 return -EFAULT;
1834 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1835 return -EINVAL;
1836
1837 ret = insert_pfn(vma, addr, pfn, pgprot);
1838
1839 if (ret)
1840 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
1841
1842 return ret;
1843}
1844EXPORT_SYMBOL(vm_insert_pfn);
1845
1846int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1847 unsigned long pfn)
1848{
1849 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1850
1851 if (addr < vma->vm_start || addr >= vma->vm_end)
1852 return -EFAULT;
1853
1854
1855
1856
1857
1858
1859
1860
1861 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1862 struct page *page;
1863
1864 page = pfn_to_page(pfn);
1865 return insert_page(vma, addr, page, vma->vm_page_prot);
1866 }
1867 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1868}
1869EXPORT_SYMBOL(vm_insert_mixed);
1870
1871
1872
1873
1874
1875
1876static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1877 unsigned long addr, unsigned long end,
1878 unsigned long pfn, pgprot_t prot)
1879{
1880 pte_t *pte;
1881 spinlock_t *ptl;
1882
1883 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1884 if (!pte)
1885 return -ENOMEM;
1886 arch_enter_lazy_mmu_mode();
1887 do {
1888 BUG_ON(!pte_none(*pte));
1889 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1890 pfn++;
1891 } while (pte++, addr += PAGE_SIZE, addr != end);
1892 arch_leave_lazy_mmu_mode();
1893 pte_unmap_unlock(pte - 1, ptl);
1894 return 0;
1895}
1896
1897static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1898 unsigned long addr, unsigned long end,
1899 unsigned long pfn, pgprot_t prot)
1900{
1901 pmd_t *pmd;
1902 unsigned long next;
1903
1904 pfn -= addr >> PAGE_SHIFT;
1905 pmd = pmd_alloc(mm, pud, addr);
1906 if (!pmd)
1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1909 do {
1910 next = pmd_addr_end(addr, end);
1911 if (remap_pte_range(mm, pmd, addr, next,
1912 pfn + (addr >> PAGE_SHIFT), prot))
1913 return -ENOMEM;
1914 } while (pmd++, addr = next, addr != end);
1915 return 0;
1916}
1917
1918static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1919 unsigned long addr, unsigned long end,
1920 unsigned long pfn, pgprot_t prot)
1921{
1922 pud_t *pud;
1923 unsigned long next;
1924
1925 pfn -= addr >> PAGE_SHIFT;
1926 pud = pud_alloc(mm, pgd, addr);
1927 if (!pud)
1928 return -ENOMEM;
1929 do {
1930 next = pud_addr_end(addr, end);
1931 if (remap_pmd_range(mm, pud, addr, next,
1932 pfn + (addr >> PAGE_SHIFT), prot))
1933 return -ENOMEM;
1934 } while (pud++, addr = next, addr != end);
1935 return 0;
1936}
1937
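/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: page frame number of the kernel memory to map
 * @size: size of the mapped area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */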
1948int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1949 unsigned long pfn, unsigned long size, pgprot_t prot)
1950{
1951 pgd_t *pgd;
1952 unsigned long next;
1953 unsigned long end = addr + PAGE_ALIGN(size);
1954 struct mm_struct *mm = vma->vm_mm;
1955 int err;
1956
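	/*
	 * Physically remapped pages are special: they carry no struct
	 * page refcounts and must never be touched by the core VM.  When
	 * the remap covers the whole vma, vm_pgoff is set to the first
	 * pfn and VM_PFN_AT_MMAP is set so that vm_normal_page() and the
	 * pfn-tracking code can recognise the linear mapping; a partial
	 * remap of a COWable vma is rejected instead.
	 */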
1975 if (addr == vma->vm_start && end == vma->vm_end) {
1976 vma->vm_pgoff = pfn;
1977 vma->vm_flags |= VM_PFN_AT_MMAP;
1978 } else if (is_cow_mapping(vma->vm_flags))
1979 return -EINVAL;
1980
1981 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1982
1983 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1984 if (err) {
1985
1986
1987
1988
1989 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1990 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1991 return -EINVAL;
1992 }
1993
1994 BUG_ON(addr >= end);
1995 pfn -= addr >> PAGE_SHIFT;
1996 pgd = pgd_offset(mm, addr);
1997 flush_cache_range(vma, addr, end);
1998 do {
1999 next = pgd_addr_end(addr, end);
2000 err = remap_pud_range(mm, pgd, addr, next,
2001 pfn + (addr >> PAGE_SHIFT), prot);
2002 if (err)
2003 break;
2004 } while (pgd++, addr = next, addr != end);
2005
2006 if (err)
2007 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2008
2009 return err;
2010}
2011EXPORT_SYMBOL(remap_pfn_range);
2012
2013static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2014 unsigned long addr, unsigned long end,
2015 pte_fn_t fn, void *data)
2016{
2017 pte_t *pte;
2018 int err;
2019 pgtable_t token;
2020 spinlock_t *uninitialized_var(ptl);
2021
2022 pte = (mm == &init_mm) ?
2023 pte_alloc_kernel(pmd, addr) :
2024 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2025 if (!pte)
2026 return -ENOMEM;
2027
2028 BUG_ON(pmd_huge(*pmd));
2029
2030 arch_enter_lazy_mmu_mode();
2031
2032 token = pmd_pgtable(*pmd);
2033
2034 do {
2035 err = fn(pte++, token, addr, data);
2036 if (err)
2037 break;
2038 } while (addr += PAGE_SIZE, addr != end);
2039
2040 arch_leave_lazy_mmu_mode();
2041
2042 if (mm != &init_mm)
2043 pte_unmap_unlock(pte-1, ptl);
2044 return err;
2045}
2046
2047static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2048 unsigned long addr, unsigned long end,
2049 pte_fn_t fn, void *data)
2050{
2051 pmd_t *pmd;
2052 unsigned long next;
2053 int err;
2054
2055 BUG_ON(pud_huge(*pud));
2056
2057 pmd = pmd_alloc(mm, pud, addr);
2058 if (!pmd)
2059 return -ENOMEM;
2060 do {
2061 next = pmd_addr_end(addr, end);
2062 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2063 if (err)
2064 break;
2065 } while (pmd++, addr = next, addr != end);
2066 return err;
2067}
2068
2069static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2070 unsigned long addr, unsigned long end,
2071 pte_fn_t fn, void *data)
2072{
2073 pud_t *pud;
2074 unsigned long next;
2075 int err;
2076
2077 pud = pud_alloc(mm, pgd, addr);
2078 if (!pud)
2079 return -ENOMEM;
2080 do {
2081 next = pud_addr_end(addr, end);
2082 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2083 if (err)
2084 break;
2085 } while (pud++, addr = next, addr != end);
2086 return err;
2087}
2088
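/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table entry.
 */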
2093int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2094 unsigned long size, pte_fn_t fn, void *data)
2095{
2096 pgd_t *pgd;
2097 unsigned long next;
2098 unsigned long end = addr + size;
2099 int err;
2100
2101 BUG_ON(addr >= end);
2102 pgd = pgd_offset(mm, addr);
2103 do {
2104 next = pgd_addr_end(addr, end);
2105 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2106 if (err)
2107 break;
2108 } while (pgd++, addr = next, addr != end);
2109
2110 return err;
2111}
2112EXPORT_SYMBOL_GPL(apply_to_page_range);
2113
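/*
 * handle_pte_fault reads the pte without the page table lock, so where a
 * pte is wider than a machine word (e.g. 32-bit PAE) the two halves may
 * not match.  Before committing to an expensive operation, re-check the
 * pte under the lock, then unmap it.  Returns non-zero if the pte was
 * still the same.
 */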
2123static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2124 pte_t *page_table, pte_t orig_pte)
2125{
2126 int same = 1;
2127#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2128 if (sizeof(pte_t) > sizeof(unsigned long)) {
2129 spinlock_t *ptl = pte_lockptr(mm, pmd);
2130 spin_lock(ptl);
2131 same = pte_same(*page_table, orig_pte);
2132 spin_unlock(ptl);
2133 }
2134#endif
2135 pte_unmap(page_table);
2136 return same;
2137}
2138
2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2140{
2141
2142
2143
2144
2145
2146
2147 if (unlikely(!src)) {
2148 void *kaddr = kmap_atomic(dst, KM_USER0);
2149 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2150
2151
2152
2153
2154
2155
2156
2157 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2158 clear_page(kaddr);
2159 kunmap_atomic(kaddr, KM_USER0);
2160 flush_dcache_page(dst);
2161 } else
2162 copy_user_highpage(dst, src, va, vma);
2163}
2164
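/*
 * Handle a write fault on a present pte: either reuse the existing page
 * (shared writable mappings, or an anonymous page mapped only here) or
 * break COW by copying it into a freshly allocated page and switching
 * the pte over.
 *
 * The caller has already done the protection checks, so the pte can
 * safely be made writable once any necessary copy has been done.  The
 * page is also marked dirty here rather than when the write happens.
 *
 * Called with mmap_sem held non-exclusively and the pte mapped and
 * locked; returns with mmap_sem still held but the pte unmapped and
 * unlocked.
 */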
2183static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2184 unsigned long address, pte_t *page_table, pmd_t *pmd,
2185 spinlock_t *ptl, pte_t orig_pte)
2186 __releases(ptl)
2187{
2188 struct page *old_page, *new_page;
2189 pte_t entry;
2190 int ret = 0;
2191 int page_mkwrite = 0;
2192 struct page *dirty_page = NULL;
2193
2194 old_page = vm_normal_page(vma, address, orig_pte);
2195 if (!old_page) {
2196
2197
2198
2199
2200
2201
2202
2203 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2204 (VM_WRITE|VM_SHARED))
2205 goto reuse;
2206 goto gotten;
2207 }
2208
2209
2210
2211
2212
2213 if (PageAnon(old_page) && !PageKsm(old_page)) {
2214 if (!trylock_page(old_page)) {
2215 page_cache_get(old_page);
2216 pte_unmap_unlock(page_table, ptl);
2217 lock_page(old_page);
2218 page_table = pte_offset_map_lock(mm, pmd, address,
2219 &ptl);
2220 if (!pte_same(*page_table, orig_pte)) {
2221 unlock_page(old_page);
2222 goto unlock;
2223 }
2224 page_cache_release(old_page);
2225 }
2226 if (reuse_swap_page(old_page)) {
2227
2228
2229
2230
2231
2232 page_move_anon_rmap(old_page, vma, address);
2233 unlock_page(old_page);
2234 goto reuse;
2235 }
2236 unlock_page(old_page);
2237 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2238 (VM_WRITE|VM_SHARED))) {
2239
2240
2241
2242
2243
2244 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2245 struct vm_fault vmf;
2246 int tmp;
2247
2248 vmf.virtual_address = (void __user *)(address &
2249 PAGE_MASK);
2250 vmf.pgoff = old_page->index;
2251 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2252 vmf.page = old_page;
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262 page_cache_get(old_page);
2263 pte_unmap_unlock(page_table, ptl);
2264
2265 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2266 if (unlikely(tmp &
2267 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2268 ret = tmp;
2269 goto unwritable_page;
2270 }
2271 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2272 lock_page(old_page);
2273 if (!old_page->mapping) {
2274 ret = 0;
2275 unlock_page(old_page);
2276 goto unwritable_page;
2277 }
2278 } else
2279 VM_BUG_ON(!PageLocked(old_page));
2280
2281
2282
2283
2284
2285
2286
2287 page_table = pte_offset_map_lock(mm, pmd, address,
2288 &ptl);
2289 if (!pte_same(*page_table, orig_pte)) {
2290 unlock_page(old_page);
2291 goto unlock;
2292 }
2293
2294 page_mkwrite = 1;
2295 }
2296 dirty_page = old_page;
2297 get_page(dirty_page);
2298
2299reuse:
2300 flush_cache_page(vma, address, pte_pfn(orig_pte));
2301 entry = pte_mkyoung(orig_pte);
2302 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2303 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2304 update_mmu_cache(vma, address, page_table);
2305 pte_unmap_unlock(page_table, ptl);
2306 ret |= VM_FAULT_WRITE;
2307
2308 if (!dirty_page)
2309 return ret;
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319 if (!page_mkwrite) {
2320 wait_on_page_locked(dirty_page);
2321 set_page_dirty_balance(dirty_page, page_mkwrite);
2322 }
2323 put_page(dirty_page);
2324 if (page_mkwrite) {
2325 struct address_space *mapping = dirty_page->mapping;
2326
2327 set_page_dirty(dirty_page);
2328 unlock_page(dirty_page);
2329 page_cache_release(dirty_page);
2330 if (mapping) {
2331
2332
2333
2334
2335 balance_dirty_pages_ratelimited(mapping);
2336 }
2337 }
2338
2339
2340 if (vma->vm_file)
2341 file_update_time(vma->vm_file);
2342
2343 return ret;
2344 }
2345
2346
2347
2348
2349 page_cache_get(old_page);
2350gotten:
2351 pte_unmap_unlock(page_table, ptl);
2352
2353 if (unlikely(anon_vma_prepare(vma)))
2354 goto oom;
2355
2356 if (is_zero_pfn(pte_pfn(orig_pte))) {
2357 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2358 if (!new_page)
2359 goto oom;
2360 } else {
2361 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2362 if (!new_page)
2363 goto oom;
2364 cow_user_page(new_page, old_page, address, vma);
2365 }
2366 __SetPageUptodate(new_page);
2367
2368 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2369 goto oom_free_new;
2370
2371
2372
2373
2374 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2375 if (likely(pte_same(*page_table, orig_pte))) {
2376 if (old_page) {
2377 if (!PageAnon(old_page)) {
2378 dec_mm_counter_fast(mm, MM_FILEPAGES);
2379 inc_mm_counter_fast(mm, MM_ANONPAGES);
2380 }
2381 } else
2382 inc_mm_counter_fast(mm, MM_ANONPAGES);
2383 flush_cache_page(vma, address, pte_pfn(orig_pte));
2384 entry = mk_pte(new_page, vma->vm_page_prot);
2385 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2386
2387
2388
2389
2390
2391
2392 ptep_clear_flush(vma, address, page_table);
2393 page_add_new_anon_rmap(new_page, vma, address);
2394
2395
2396
2397
2398
2399 set_pte_at_notify(mm, address, page_table, entry);
2400 update_mmu_cache(vma, address, page_table);
2401 if (old_page) {
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424 page_remove_rmap(old_page);
2425 }
2426
2427
2428 new_page = old_page;
2429 ret |= VM_FAULT_WRITE;
2430 } else
2431 mem_cgroup_uncharge_page(new_page);
2432
2433 if (new_page)
2434 page_cache_release(new_page);
2435unlock:
2436 pte_unmap_unlock(page_table, ptl);
2437 if (old_page) {
2438
2439
2440
2441
2442 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2443 lock_page(old_page);
2444 munlock_vma_page(old_page);
2445 unlock_page(old_page);
2446 }
2447 page_cache_release(old_page);
2448 }
2449 return ret;
2450oom_free_new:
2451 page_cache_release(new_page);
2452oom:
2453 if (old_page) {
2454 if (page_mkwrite) {
2455 unlock_page(old_page);
2456 page_cache_release(old_page);
2457 }
2458 page_cache_release(old_page);
2459 }
2460 return VM_FAULT_OOM;
2461
2462unwritable_page:
2463 page_cache_release(old_page);
2464 return ret;
2465}
2466
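/*
 * Helper functions for unmap_mapping_range().
 *
 * i_mmap_lock has to be dropped periodically to keep latency down, and
 * the prio_tree walk must then be restarted.  To avoid re-zapping vmas
 * that are already finished, each vma records its progress in
 * vm_truncate_count: the current mapping->truncate_count once the vma
 * has been fully unmapped, or the restart address if unmapping was
 * interrupted part way through.  is_restart_addr() can tell the two
 * apart because restart addresses are page aligned, while truncate_count
 * is deliberately never allowed to look page aligned.
 */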
2499#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2500
2501static void reset_vma_truncate_counts(struct address_space *mapping)
2502{
2503 struct vm_area_struct *vma;
2504 struct prio_tree_iter iter;
2505
2506 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2507 vma->vm_truncate_count = 0;
2508 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2509 vma->vm_truncate_count = 0;
2510}
2511
2512static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2513 unsigned long start_addr, unsigned long end_addr,
2514 struct zap_details *details)
2515{
2516 unsigned long restart_addr;
2517 int need_break;
2518
2519
2520
2521
2522
2523
2524
2525
2526again:
2527 restart_addr = vma->vm_truncate_count;
2528 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2529 start_addr = restart_addr;
2530 if (start_addr >= end_addr) {
2531
2532 vma->vm_truncate_count = details->truncate_count;
2533 return 0;
2534 }
2535 }
2536
2537 restart_addr = zap_page_range(vma, start_addr,
2538 end_addr - start_addr, details);
2539 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2540
2541 if (restart_addr >= end_addr) {
2542
2543 vma->vm_truncate_count = details->truncate_count;
2544 if (!need_break)
2545 return 0;
2546 } else {
2547
2548 vma->vm_truncate_count = restart_addr;
2549 if (!need_break)
2550 goto again;
2551 }
2552
2553 spin_unlock(details->i_mmap_lock);
2554 cond_resched();
2555 spin_lock(details->i_mmap_lock);
2556 return -EINTR;
2557}
2558
2559static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2560 struct zap_details *details)
2561{
2562 struct vm_area_struct *vma;
2563 struct prio_tree_iter iter;
2564 pgoff_t vba, vea, zba, zea;
2565
2566restart:
2567 vma_prio_tree_foreach(vma, &iter, root,
2568 details->first_index, details->last_index) {
2569
2570 if (vma->vm_truncate_count == details->truncate_count)
2571 continue;
2572
2573 vba = vma->vm_pgoff;
2574 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2575
2576 zba = details->first_index;
2577 if (zba < vba)
2578 zba = vba;
2579 zea = details->last_index;
2580 if (zea > vea)
2581 zea = vea;
2582
2583 if (unmap_mapping_range_vma(vma,
2584 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2585 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2586 details) < 0)
2587 goto restart;
2588 }
2589}
2590
2591static inline void unmap_mapping_range_list(struct list_head *head,
2592 struct zap_details *details)
2593{
2594 struct vm_area_struct *vma;
2595
2596
2597
2598
2599
2600
2601
2602restart:
2603 list_for_each_entry(vma, head, shared.vm_set.list) {
2604
2605 if (vma->vm_truncate_count == details->truncate_count)
2606 continue;
2607 details->nonlinear_vma = vma;
2608 if (unmap_mapping_range_vma(vma, vma->vm_start,
2609 vma->vm_end, details) < 0)
2610 goto restart;
2611 }
2612}
2613
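/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the
 * underlying file.
 * @mapping: the address space containing mmaps to be unmapped
 * @holebegin: byte in first page to unmap, relative to the start of the
 * underlying file; rounded down to a PAGE_SIZE boundary
 * @holelen: size of the prospective hole in bytes, rounded up to a
 * PAGE_SIZE boundary; zero means unmap to the end of the file
 * @even_cows: 1 when truncating a file, to unmap even private COWed
 * pages; 0 when invalidating pagecache, to preserve private data
 */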
2628void unmap_mapping_range(struct address_space *mapping,
2629 loff_t const holebegin, loff_t const holelen, int even_cows)
2630{
2631 struct zap_details details;
2632 pgoff_t hba = holebegin >> PAGE_SHIFT;
2633 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2634
2635
2636 if (sizeof(holelen) > sizeof(hlen)) {
2637 long long holeend =
2638 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2639 if (holeend & ~(long long)ULONG_MAX)
2640 hlen = ULONG_MAX - hba + 1;
2641 }
2642
2643 details.check_mapping = even_cows? NULL: mapping;
2644 details.nonlinear_vma = NULL;
2645 details.first_index = hba;
2646 details.last_index = hba + hlen - 1;
2647 if (details.last_index < details.first_index)
2648 details.last_index = ULONG_MAX;
2649 details.i_mmap_lock = &mapping->i_mmap_lock;
2650
2651 mutex_lock(&mapping->unmap_mutex);
2652 spin_lock(&mapping->i_mmap_lock);
2653
2654
2655 mapping->truncate_count++;
2656 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2657 if (mapping->truncate_count == 0)
2658 reset_vma_truncate_counts(mapping);
2659 mapping->truncate_count++;
2660 }
2661 details.truncate_count = mapping->truncate_count;
2662
2663 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2664 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2665 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2666 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2667 spin_unlock(&mapping->i_mmap_lock);
2668 mutex_unlock(&mapping->unmap_mutex);
2669}
2670EXPORT_SYMBOL(unmap_mapping_range);
2671
2672int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2673{
2674 struct address_space *mapping = inode->i_mapping;
2675
2676
2677
2678
2679
2680
2681 if (!inode->i_op->truncate_range)
2682 return -ENOSYS;
2683
2684 mutex_lock(&inode->i_mutex);
2685 down_write(&inode->i_alloc_sem);
2686 unmap_mapping_range(mapping, offset, (end - offset), 1);
2687 truncate_inode_pages_range(mapping, offset, end);
2688 unmap_mapping_range(mapping, offset, (end - offset), 1);
2689 inode->i_op->truncate_range(inode, offset, end);
2690 up_write(&inode->i_alloc_sem);
2691 mutex_unlock(&inode->i_mutex);
2692
2693 return 0;
2694}
2695
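/*
 * Fault in a page that is currently swapped out (or being migrated).
 *
 * Called with mmap_sem held non-exclusively and the pte mapped but not
 * yet locked; returns with mmap_sem still held but the pte unmapped and
 * unlocked.
 */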
2701static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2702 unsigned long address, pte_t *page_table, pmd_t *pmd,
2703 unsigned int flags, pte_t orig_pte)
2704{
2705 spinlock_t *ptl;
2706 struct page *page, *swapcache = NULL;
2707 swp_entry_t entry;
2708 pte_t pte;
2709 int locked;
2710 struct mem_cgroup *ptr = NULL;
2711 int exclusive = 0;
2712 int ret = 0;
2713
2714 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2715 goto out;
2716
2717 entry = pte_to_swp_entry(orig_pte);
2718 if (unlikely(non_swap_entry(entry))) {
2719 if (is_migration_entry(entry)) {
2720 migration_entry_wait(mm, pmd, address);
2721 } else if (is_hwpoison_entry(entry)) {
2722 ret = VM_FAULT_HWPOISON;
2723 } else {
2724 print_bad_pte(vma, address, orig_pte, NULL);
2725 ret = VM_FAULT_SIGBUS;
2726 }
2727 goto out;
2728 }
2729 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2730 page = lookup_swap_cache(entry);
2731 if (!page) {
2732 grab_swap_token(mm);
2733 page = swapin_readahead(entry,
2734 GFP_HIGHUSER_MOVABLE, vma, address);
2735 if (!page) {
2736
2737
2738
2739
2740 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2741 if (likely(pte_same(*page_table, orig_pte)))
2742 ret = VM_FAULT_OOM;
2743 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2744 goto unlock;
2745 }
2746
2747
2748 ret = VM_FAULT_MAJOR;
2749 count_vm_event(PGMAJFAULT);
2750 } else if (PageHWPoison(page)) {
2751
2752
2753
2754
2755 ret = VM_FAULT_HWPOISON;
2756 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2757 goto out_release;
2758 }
2759
2760 locked = lock_page_or_retry(page, mm, flags);
2761 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2762 if (!locked) {
2763 ret |= VM_FAULT_RETRY;
2764 goto out_release;
2765 }
2766
2767
2768
2769
2770
2771
2772
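	/*
	 * Make sure the page still belongs to this swap entry:
	 * try_to_free_swap() or swapoff could have released the
	 * swapcache while we waited for the page lock.
	 */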
2773 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2774 goto out_page;
2775
2776 if (ksm_might_need_to_copy(page, vma, address)) {
2777 swapcache = page;
2778 page = ksm_does_need_to_copy(page, vma, address);
2779
2780 if (unlikely(!page)) {
2781 ret = VM_FAULT_OOM;
2782 page = swapcache;
2783 swapcache = NULL;
2784 goto out_page;
2785 }
2786 }
2787
2788 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2789 ret = VM_FAULT_OOM;
2790 goto out_page;
2791 }
2792
2793
2794
2795
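	/*
	 * Back out if somebody else already faulted in this pte.
	 */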
2796 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2797 if (unlikely(!pte_same(*page_table, orig_pte)))
2798 goto out_nomap;
2799
2800 if (unlikely(!PageUptodate(page))) {
2801 ret = VM_FAULT_SIGBUS;
2802 goto out_nomap;
2803 }
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
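	/*
	 * The page is locked, up to date and still in the swap cache:
	 * account it as an anonymous page, drop the swap-entry count,
	 * and build the pte.  For a write fault, map it writable right
	 * away if we are the only user of the swap page, which spares
	 * the copy that do_wp_page() would otherwise make below.
	 */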
2819 inc_mm_counter_fast(mm, MM_ANONPAGES);
2820 dec_mm_counter_fast(mm, MM_SWAPENTS);
2821 pte = mk_pte(page, vma->vm_page_prot);
2822 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2823 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2824 flags &= ~FAULT_FLAG_WRITE;
2825 ret |= VM_FAULT_WRITE;
2826 exclusive = 1;
2827 }
2828 flush_icache_page(vma, page);
2829 set_pte_at(mm, address, page_table, pte);
2830 do_page_add_anon_rmap(page, vma, address, exclusive);
2831
2832 mem_cgroup_commit_charge_swapin(page, ptr);
2833
2834 swap_free(entry);
2835 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2836 try_to_free_swap(page);
2837 unlock_page(page);
2838 if (swapcache) {
2839
2840
2841
2842
2843
2844
2845
2846
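		/*
		 * The pte was mapped to a fresh copy of the swapcache page
		 * (see ksm_might_need_to_copy() above), so the original
		 * page is no longer needed here: unlock it and drop our
		 * reference.
		 */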
2847 unlock_page(swapcache);
2848 page_cache_release(swapcache);
2849 }
2850
2851 if (flags & FAULT_FLAG_WRITE) {
2852 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2853 if (ret & VM_FAULT_ERROR)
2854 ret &= VM_FAULT_ERROR;
2855 goto out;
2856 }
2857
2858
2859 update_mmu_cache(vma, address, page_table);
2860unlock:
2861 pte_unmap_unlock(page_table, ptl);
2862out:
2863 return ret;
2864out_nomap:
2865 mem_cgroup_cancel_charge_swapin(ptr);
2866 pte_unmap_unlock(page_table, ptl);
2867out_page:
2868 unlock_page(page);
2869out_release:
2870 page_cache_release(page);
2871 if (swapcache) {
2872 unlock_page(swapcache);
2873 page_cache_release(swapcache);
2874 }
2875 return ret;
2876}
2877
2878
2879
2880
2881
2882
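/*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */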
2883static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2884{
2885 address &= PAGE_MASK;
2886 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2887 struct vm_area_struct *prev = vma->vm_prev;
2888
2889
2890
2891
2892
2893
2894
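		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That's only ok if it's the same stack mapping
		 * that has gotten split..
		 */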
2895 if (prev && prev->vm_end == address)
2896 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2897
2898 expand_stack(vma, address - PAGE_SIZE);
2899 }
2900 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2901 struct vm_area_struct *next = vma->vm_next;
2902
2903
2904 if (next && next->vm_start == address + PAGE_SIZE)
2905 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2906
2907 expand_upwards(vma, address + PAGE_SIZE);
2908 }
2909 return 0;
2910}
2911
2912
2913
2914
2915
2916
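/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */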
2917static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2918 unsigned long address, pte_t *page_table, pmd_t *pmd,
2919 unsigned int flags)
2920{
2921 struct page *page;
2922 spinlock_t *ptl;
2923 pte_t entry;
2924
2925 pte_unmap(page_table);
2926
2927
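	/* Check if we need to add a guard page to the stack */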
2928 if (check_stack_guard_page(vma, address) < 0)
2929 return VM_FAULT_SIGBUS;
2930
2931
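	/* Use the zero-page for reads */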
2932 if (!(flags & FAULT_FLAG_WRITE)) {
2933 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2934 vma->vm_page_prot));
2935 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2936 if (!pte_none(*page_table))
2937 goto unlock;
2938 goto setpte;
2939 }
2940
2941
2942 if (unlikely(anon_vma_prepare(vma)))
2943 goto oom;
2944 page = alloc_zeroed_user_highpage_movable(vma, address);
2945 if (!page)
2946 goto oom;
2947 __SetPageUptodate(page);
2948
2949 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2950 goto oom_free_page;
2951
2952 entry = mk_pte(page, vma->vm_page_prot);
2953 if (vma->vm_flags & VM_WRITE)
2954 entry = pte_mkwrite(pte_mkdirty(entry));
2955
2956 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2957 if (!pte_none(*page_table))
2958 goto release;
2959
2960 inc_mm_counter_fast(mm, MM_ANONPAGES);
2961 page_add_new_anon_rmap(page, vma, address);
2962setpte:
2963 set_pte_at(mm, address, page_table, entry);
2964
2965
2966 update_mmu_cache(vma, address, page_table);
2967unlock:
2968 pte_unmap_unlock(page_table, ptl);
2969 return 0;
2970release:
2971 mem_cgroup_uncharge_page(page);
2972 page_cache_release(page);
2973 goto unlock;
2974oom_free_page:
2975 page_cache_release(page);
2976oom:
2977 return VM_FAULT_OOM;
2978}
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
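/*
 * __do_fault() tries to create a new page mapping.  It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
 * the next page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte neither mapped nor locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */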
2993static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2994 unsigned long address, pmd_t *pmd,
2995 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2996{
2997 pte_t *page_table;
2998 spinlock_t *ptl;
2999 struct page *page;
3000 pte_t entry;
3001 int anon = 0;
3002 int charged = 0;
3003 struct page *dirty_page = NULL;
3004 struct vm_fault vmf;
3005 int ret;
3006 int page_mkwrite = 0;
3007
3008 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3009 vmf.pgoff = pgoff;
3010 vmf.flags = flags;
3011 vmf.page = NULL;
3012
3013 ret = vma->vm_ops->fault(vma, &vmf);
3014 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3015 VM_FAULT_RETRY)))
3016 return ret;
3017
3018 if (unlikely(PageHWPoison(vmf.page))) {
3019 if (ret & VM_FAULT_LOCKED)
3020 unlock_page(vmf.page);
3021 return VM_FAULT_HWPOISON;
3022 }
3023
3024
3025
3026
3027
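	/*
	 * For consistency in subsequent calls, make the faulted page
	 * always locked.
	 */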
3028 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3029 lock_page(vmf.page);
3030 else
3031 VM_BUG_ON(!PageLocked(vmf.page));
3032
3033
3034
3035
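	/*
	 * Should we do an early C-O-W break?
	 */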
3036 page = vmf.page;
3037 if (flags & FAULT_FLAG_WRITE) {
3038 if (!(vma->vm_flags & VM_SHARED)) {
3039 anon = 1;
3040 if (unlikely(anon_vma_prepare(vma))) {
3041 ret = VM_FAULT_OOM;
3042 goto out;
3043 }
3044 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3045 vma, address);
3046 if (!page) {
3047 ret = VM_FAULT_OOM;
3048 goto out;
3049 }
3050 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3051 ret = VM_FAULT_OOM;
3052 page_cache_release(page);
3053 goto out;
3054 }
3055 charged = 1;
3056 copy_user_highpage(page, vmf.page, address, vma);
3057 __SetPageUptodate(page);
3058 } else {
3059
3060
3061
3062
3063
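			/*
			 * If the page will be shareable, see if the backing
			 * address space wants to know that the page is about
			 * to become writable.
			 */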
3064 if (vma->vm_ops->page_mkwrite) {
3065 int tmp;
3066
3067 unlock_page(page);
3068 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3069 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3070 if (unlikely(tmp &
3071 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3072 ret = tmp;
3073 goto unwritable_page;
3074 }
3075 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3076 lock_page(page);
3077 if (!page->mapping) {
3078 ret = 0;
3079 unlock_page(page);
3080 goto unwritable_page;
3081 }
3082 } else
3083 VM_BUG_ON(!PageLocked(page));
3084 page_mkwrite = 1;
3085 }
3086 }
3087
3088 }
3089
3090 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
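	/*
	 * Only install the new pte if nobody else faulted in the
	 * meantime: the entry must still match orig_pte.  On success,
	 * account the page in the anon or file rmap and, for a write
	 * fault on a shared mapping, remember the page so it can be
	 * dirtied and throttled below.
	 */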
3103 if (likely(pte_same(*page_table, orig_pte))) {
3104 flush_icache_page(vma, page);
3105 entry = mk_pte(page, vma->vm_page_prot);
3106 if (flags & FAULT_FLAG_WRITE)
3107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3108 if (anon) {
3109 inc_mm_counter_fast(mm, MM_ANONPAGES);
3110 page_add_new_anon_rmap(page, vma, address);
3111 } else {
3112 inc_mm_counter_fast(mm, MM_FILEPAGES);
3113 page_add_file_rmap(page);
3114 if (flags & FAULT_FLAG_WRITE) {
3115 dirty_page = page;
3116 get_page(dirty_page);
3117 }
3118 }
3119 set_pte_at(mm, address, page_table, entry);
3120
3121
3122 update_mmu_cache(vma, address, page_table);
3123 } else {
3124 if (charged)
3125 mem_cgroup_uncharge_page(page);
3126 if (anon)
3127 page_cache_release(page);
3128 else
3129 anon = 1;
3130 }
3131
3132 pte_unmap_unlock(page_table, ptl);
3133
3134out:
3135 if (dirty_page) {
3136 struct address_space *mapping = page->mapping;
3137
3138 if (set_page_dirty(dirty_page))
3139 page_mkwrite = 1;
3140 unlock_page(dirty_page);
3141 put_page(dirty_page);
3142 if (page_mkwrite && mapping) {
3143
3144
3145
3146
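			/*
			 * The page was made writable via page_mkwrite (or
			 * newly dirtied), so give the dirty-page balancing
			 * code a chance to throttle this task.
			 */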
3147 balance_dirty_pages_ratelimited(mapping);
3148 }
3149
3150
3151 if (vma->vm_file)
3152 file_update_time(vma->vm_file);
3153 } else {
3154 unlock_page(vmf.page);
3155 if (anon)
3156 page_cache_release(vmf.page);
3157 }
3158
3159 return ret;
3160
3161unwritable_page:
3162 page_cache_release(page);
3163 return ret;
3164}
3165
3166static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3167 unsigned long address, pte_t *page_table, pmd_t *pmd,
3168 unsigned int flags, pte_t orig_pte)
3169{
3170 pgoff_t pgoff = (((address & PAGE_MASK)
3171 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3172
3173 pte_unmap(page_table);
3174 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3175}
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
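/*
 * Fault of a previously existing named mapping.  Repopulate the pte
 * from the encoded file pte if possible.  This enables swappable
 * non-linear vmas.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */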
3186static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3187 unsigned long address, pte_t *page_table, pmd_t *pmd,
3188 unsigned int flags, pte_t orig_pte)
3189{
3190 pgoff_t pgoff;
3191
3192 flags |= FAULT_FLAG_NONLINEAR;
3193
3194 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3195 return 0;
3196
3197 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3198
3199
3200
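		/*
		 * Page table corrupted: show pte and kill process.
		 */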
3201 print_bad_pte(vma, address, orig_pte, NULL);
3202 return VM_FAULT_SIGBUS;
3203 }
3204
3205 pgoff = pte_to_pgoff(orig_pte);
3206 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3207}
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
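/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures); update_mmu_cache() gives architectures with
 * external MMU caches a chance to refresh them.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */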
3222int handle_pte_fault(struct mm_struct *mm,
3223 struct vm_area_struct *vma, unsigned long address,
3224 pte_t *pte, pmd_t *pmd, unsigned int flags)
3225{
3226 pte_t entry;
3227 spinlock_t *ptl;
3228
3229 entry = *pte;
3230 if (!pte_present(entry)) {
3231 if (pte_none(entry)) {
3232 if (vma->vm_ops) {
3233 if (likely(vma->vm_ops->fault))
3234 return do_linear_fault(mm, vma, address,
3235 pte, pmd, flags, entry);
3236 }
3237 return do_anonymous_page(mm, vma, address,
3238 pte, pmd, flags);
3239 }
3240 if (pte_file(entry))
3241 return do_nonlinear_fault(mm, vma, address,
3242 pte, pmd, flags, entry);
3243 return do_swap_page(mm, vma, address,
3244 pte, pmd, flags, entry);
3245 }
3246
3247 ptl = pte_lockptr(mm, pmd);
3248 spin_lock(ptl);
3249 if (unlikely(!pte_same(*pte, entry)))
3250 goto unlock;
3251 if (flags & FAULT_FLAG_WRITE) {
3252 if (!pte_write(entry))
3253 return do_wp_page(mm, vma, address,
3254 pte, pmd, ptl, entry);
3255 entry = pte_mkdirty(entry);
3256 }
3257 entry = pte_mkyoung(entry);
3258 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3259 update_mmu_cache(vma, address, pte);
3260 } else {
3261
3262
3263
3264
3265
3266
3267 if (flags & FAULT_FLAG_WRITE)
3268 flush_tlb_fix_spurious_fault(vma, address);
3269 }
3270unlock:
3271 pte_unmap_unlock(pte, ptl);
3272 return 0;
3273}
3274
3275
3276
3277
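/*
 * By the time we get here, we already hold the mm semaphore.
 */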
3278int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3279 unsigned long address, unsigned int flags)
3280{
3281 pgd_t *pgd;
3282 pud_t *pud;
3283 pmd_t *pmd;
3284 pte_t *pte;
3285
3286 __set_current_state(TASK_RUNNING);
3287
3288 count_vm_event(PGFAULT);
3289
3290
3291 check_sync_rss_stat(current);
3292
3293 if (unlikely(is_vm_hugetlb_page(vma)))
3294 return hugetlb_fault(mm, vma, address, flags);
3295
3296 pgd = pgd_offset(mm, address);
3297 pud = pud_alloc(mm, pgd, address);
3298 if (!pud)
3299 return VM_FAULT_OOM;
3300 pmd = pmd_alloc(mm, pud, address);
3301 if (!pmd)
3302 return VM_FAULT_OOM;
3303 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3304 if (!vma->vm_ops)
3305 return do_huge_pmd_anonymous_page(mm, vma, address,
3306 pmd, flags);
3307 } else {
3308 pmd_t orig_pmd = *pmd;
3309 barrier();
3310 if (pmd_trans_huge(orig_pmd)) {
3311 if (flags & FAULT_FLAG_WRITE &&
3312 !pmd_write(orig_pmd) &&
3313 !pmd_trans_splitting(orig_pmd))
3314 return do_huge_pmd_wp_page(mm, vma, address,
3315 pmd, orig_pmd);
3316 return 0;
3317 }
3318 }
3319
3320
3321
3322
3323
3324
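	/*
	 * Use __pte_alloc() instead of pte_alloc_map(): we can't run
	 * pte_offset_map() on the pmd while a huge pmd could
	 * materialize under us from another thread.
	 */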
3325 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3326 return VM_FAULT_OOM;
3327
3328 if (unlikely(pmd_trans_huge(*pmd)))
3329 return 0;
3330
3331
3332
3333
3334
3335
3336 pte = pte_offset_map(pmd, address);
3337
3338 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3339}
3340
3341#ifndef __PAGETABLE_PUD_FOLDED
3342
3343
3344
3345
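/*
 * Allocate page upper directory.
 * We've already handled the fast-path inline.
 */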
3346int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3347{
3348 pud_t *new = pud_alloc_one(mm, address);
3349 if (!new)
3350 return -ENOMEM;
3351
3352 smp_wmb();
3353
3354 spin_lock(&mm->page_table_lock);
3355 if (pgd_present(*pgd))
3356 pud_free(mm, new);
3357 else
3358 pgd_populate(mm, pgd, new);
3359 spin_unlock(&mm->page_table_lock);
3360 return 0;
3361}
3362#endif
3363
3364#ifndef __PAGETABLE_PMD_FOLDED
3365
3366
3367
3368
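/*
 * Allocate page middle directory.
 * We've already handled the fast-path inline.
 */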
3369int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3370{
3371 pmd_t *new = pmd_alloc_one(mm, address);
3372 if (!new)
3373 return -ENOMEM;
3374
3375 smp_wmb();
3376
3377 spin_lock(&mm->page_table_lock);
3378#ifndef __ARCH_HAS_4LEVEL_HACK
3379 if (pud_present(*pud))
3380 pmd_free(mm, new);
3381 else
3382 pud_populate(mm, pud, new);
3383#else
3384 if (pgd_present(*pud))
3385 pmd_free(mm, new);
3386 else
3387 pgd_populate(mm, pud, new);
3388#endif
3389 spin_unlock(&mm->page_table_lock);
3390 return 0;
3391}
3392#endif
3393
3394int make_pages_present(unsigned long addr, unsigned long end)
3395{
3396 int ret, len, write;
3397 struct vm_area_struct * vma;
3398
3399 vma = find_vma(current->mm, addr);
3400 if (!vma)
3401 return -ENOMEM;
3402
3403
3404
3405
3406
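	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't
	 * COW and we would not want to dirty them for nothing.
	 */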
3407 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3408 BUG_ON(addr >= end);
3409 BUG_ON(end > vma->vm_end);
3410 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3411 ret = get_user_pages(current, current->mm, addr,
3412 len, write, 0, NULL, NULL);
3413 if (ret < 0)
3414 return ret;
3415 return ret == len ? 0 : -EFAULT;
3416}
3417
3418#if !defined(__HAVE_ARCH_GATE_AREA)
3419
3420#if defined(AT_SYSINFO_EHDR)
3421static struct vm_area_struct gate_vma;
3422
3423static int __init gate_vma_init(void)
3424{
3425 gate_vma.vm_mm = NULL;
3426 gate_vma.vm_start = FIXADDR_USER_START;
3427 gate_vma.vm_end = FIXADDR_USER_END;
3428 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3429 gate_vma.vm_page_prot = __P101;
3430
3431
3432
3433
3434
3435
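	/*
	 * Make sure the vDSO gets into every core dump.  Dumping its
	 * contents makes post-mortem fully interpretable later without
	 * matching up the same kernel and hardware config to see what
	 * PC values meant.
	 */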
3436 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3437 return 0;
3438}
3439__initcall(gate_vma_init);
3440#endif
3441
3442struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3443{
3444#ifdef AT_SYSINFO_EHDR
3445 return &gate_vma;
3446#else
3447 return NULL;
3448#endif
3449}
3450
3451int in_gate_area_no_task(unsigned long addr)
3452{
3453#ifdef AT_SYSINFO_EHDR
3454 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3455 return 1;
3456#endif
3457 return 0;
3458}
3459
3460#endif
3461
3462static int __follow_pte(struct mm_struct *mm, unsigned long address,
3463 pte_t **ptepp, spinlock_t **ptlp)
3464{
3465 pgd_t *pgd;
3466 pud_t *pud;
3467 pmd_t *pmd;
3468 pte_t *ptep;
3469
3470 pgd = pgd_offset(mm, address);
3471 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3472 goto out;
3473
3474 pud = pud_offset(pgd, address);
3475 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3476 goto out;
3477
3478 pmd = pmd_offset(pud, address);
3479 VM_BUG_ON(pmd_trans_huge(*pmd));
3480 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3481 goto out;
3482
3483
3484 if (pmd_huge(*pmd))
3485 goto out;
3486
3487 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3488 if (!ptep)
3489 goto out;
3490 if (!pte_present(*ptep))
3491 goto unlock;
3492 *ptepp = ptep;
3493 return 0;
3494unlock:
3495 pte_unmap_unlock(ptep, *ptlp);
3496out:
3497 return -EINVAL;
3498}
3499
3500static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3501 pte_t **ptepp, spinlock_t **ptlp)
3502{
3503 int res;
3504
3505
3506 (void) __cond_lock(*ptlp,
3507 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3508 return res;
3509}
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
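/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */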
3521int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3522 unsigned long *pfn)
3523{
3524 int ret = -EINVAL;
3525 spinlock_t *ptl;
3526 pte_t *ptep;
3527
3528 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3529 return ret;
3530
3531 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3532 if (ret)
3533 return ret;
3534 *pfn = pte_pfn(*ptep);
3535 pte_unmap_unlock(ptep, ptl);
3536 return 0;
3537}
3538EXPORT_SYMBOL(follow_pfn);
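/*
 * Usage sketch (illustrative only; "vma", "addr" and the printed
 * message are hypothetical, not taken from any caller in this file):
 *
 *	unsigned long pfn;
 *
 *	if (!follow_pfn(vma, addr, &pfn))
 *		pr_info("%#lx maps pfn %#lx\n", addr, pfn);
 */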
3539
3540#ifdef CONFIG_HAVE_IOREMAP_PROT
3541int follow_phys(struct vm_area_struct *vma,
3542 unsigned long address, unsigned int flags,
3543 unsigned long *prot, resource_size_t *phys)
3544{
3545 int ret = -EINVAL;
3546 pte_t *ptep, pte;
3547 spinlock_t *ptl;
3548
3549 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3550 goto out;
3551
3552 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3553 goto out;
3554 pte = *ptep;
3555
3556 if ((flags & FOLL_WRITE) && !pte_write(pte))
3557 goto unlock;
3558
3559 *prot = pgprot_val(pte_pgprot(pte));
3560 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3561
3562 ret = 0;
3563unlock:
3564 pte_unmap_unlock(ptep, ptl);
3565out:
3566 return ret;
3567}
3568
3569int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3570 void *buf, int len, int write)
3571{
3572 resource_size_t phys_addr;
3573 unsigned long prot = 0;
3574 void __iomem *maddr;
3575 int offset = addr & (PAGE_SIZE-1);
3576
3577 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3578 return -EINVAL;
3579
3580 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3581 if (write)
3582 memcpy_toio(maddr + offset, buf, len);
3583 else
3584 memcpy_fromio(buf, maddr + offset, len);
3585 iounmap(maddr);
3586
3587 return len;
3588}
3589#endif
3590
3591
3592
3593
3594
3595
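/*
 * Access another process' address space: copy between the given
 * kernel buffer and the target's memory at 'addr'.  Returns the
 * number of bytes actually transferred.
 */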
3596int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
3597{
3598 struct mm_struct *mm;
3599 struct vm_area_struct *vma;
3600 void *old_buf = buf;
3601
3602 mm = get_task_mm(tsk);
3603 if (!mm)
3604 return 0;
3605
3606 down_read(&mm->mmap_sem);
3607
3608 while (len) {
3609 int bytes, ret, offset;
3610 void *maddr;
3611 struct page *page = NULL;
3612
3613 ret = get_user_pages(tsk, mm, addr, 1,
3614 write, 1, &page, &vma);
3615 if (ret <= 0) {
3616
3617
3618
3619
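			/*
			 * get_user_pages() failed: this may be an I/O or
			 * PFN mapping, which we can still reach through the
			 * vma's ->access() method if one is provided.
			 */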
3620#ifdef CONFIG_HAVE_IOREMAP_PROT
3621 vma = find_vma(mm, addr);
3622 if (!vma)
3623 break;
3624 if (vma->vm_ops && vma->vm_ops->access)
3625 ret = vma->vm_ops->access(vma, addr, buf,
3626 len, write);
3627 if (ret <= 0)
3628#endif
3629 break;
3630 bytes = ret;
3631 } else {
3632 bytes = len;
3633 offset = addr & (PAGE_SIZE-1);
3634 if (bytes > PAGE_SIZE-offset)
3635 bytes = PAGE_SIZE-offset;
3636
3637 maddr = kmap(page);
3638 if (write) {
3639 copy_to_user_page(vma, page, addr,
3640 maddr + offset, buf, bytes);
3641 set_page_dirty_lock(page);
3642 } else {
3643 copy_from_user_page(vma, page, addr,
3644 buf, maddr + offset, bytes);
3645 }
3646 kunmap(page);
3647 page_cache_release(page);
3648 }
3649 len -= bytes;
3650 buf += bytes;
3651 addr += bytes;
3652 }
3653 up_read(&mm->mmap_sem);
3654 mmput(mm);
3655
3656 return buf - old_buf;
3657}
3658
3659
3660
3661
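/*
 * Print the file name and address range of the vma containing the
 * given instruction pointer, if any.
 */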
3662void print_vma_addr(char *prefix, unsigned long ip)
3663{
3664 struct mm_struct *mm = current->mm;
3665 struct vm_area_struct *vma;
3666
3667
3668
3669
3670
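	/*
	 * Do not print if we are in atomic context (e.g. on an exception
	 * stack): we cannot take mmap_sem here.
	 */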
3671 if (preempt_count())
3672 return;
3673
3674 down_read(&mm->mmap_sem);
3675 vma = find_vma(mm, ip);
3676 if (vma && vma->vm_file) {
3677 struct file *f = vma->vm_file;
3678 char *buf = (char *)__get_free_page(GFP_KERNEL);
3679 if (buf) {
3680 char *p, *s;
3681
3682 p = d_path(&f->f_path, buf, PAGE_SIZE);
3683 if (IS_ERR(p))
3684 p = "?";
3685 s = strrchr(p, '/');
3686 if (s)
3687 p = s+1;
3688 printk("%s%s[%lx+%lx]", prefix, p,
3689 vma->vm_start,
3690 vma->vm_end - vma->vm_start);
3691 free_page((unsigned long)buf);
3692 }
3693 }
3694 up_read(&current->mm->mmap_sem);
3695}
3696
3697#ifdef CONFIG_PROVE_LOCKING
3698void might_fault(void)
3699{
3700
3701
3702
3703
3704
3705
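	/*
	 * Accesses with KERNEL_DS never actually fault, so skip the
	 * annotations below to avoid false positives.
	 */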
3706 if (segment_eq(get_fs(), KERNEL_DS))
3707 return;
3708
3709 might_sleep();
3710
3711
3712
3713
3714
3715 if (!in_atomic() && current->mm)
3716 might_lock_read(&current->mm->mmap_sem);
3717}
3718EXPORT_SYMBOL(might_fault);
3719#endif
3720
3721#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3722static void clear_gigantic_page(struct page *page,
3723 unsigned long addr,
3724 unsigned int pages_per_huge_page)
3725{
3726 int i;
3727 struct page *p = page;
3728
3729 might_sleep();
3730 for (i = 0; i < pages_per_huge_page;
3731 i++, p = mem_map_next(p, page, i)) {
3732 cond_resched();
3733 clear_user_highpage(p, addr + i * PAGE_SIZE);
3734 }
3735}
3736void clear_huge_page(struct page *page,
3737 unsigned long addr, unsigned int pages_per_huge_page)
3738{
3739 int i;
3740
3741 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3742 clear_gigantic_page(page, addr, pages_per_huge_page);
3743 return;
3744 }
3745
3746 might_sleep();
3747 for (i = 0; i < pages_per_huge_page; i++) {
3748 cond_resched();
3749 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3750 }
3751}
3752
3753static void copy_user_gigantic_page(struct page *dst, struct page *src,
3754 unsigned long addr,
3755 struct vm_area_struct *vma,
3756 unsigned int pages_per_huge_page)
3757{
3758 int i;
3759 struct page *dst_base = dst;
3760 struct page *src_base = src;
3761
3762 for (i = 0; i < pages_per_huge_page; ) {
3763 cond_resched();
3764 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3765
3766 i++;
3767 dst = mem_map_next(dst, dst_base, i);
3768 src = mem_map_next(src, src_base, i);
3769 }
3770}
3771
3772void copy_user_huge_page(struct page *dst, struct page *src,
3773 unsigned long addr, struct vm_area_struct *vma,
3774 unsigned int pages_per_huge_page)
3775{
3776 int i;
3777
3778 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3779 copy_user_gigantic_page(dst, src, addr, vma,
3780 pages_per_huge_page);
3781 return;
3782 }
3783
3784 might_sleep();
3785 for (i = 0; i < pages_per_huge_page; i++) {
3786 cond_resched();
3787 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3788 }
3789}
3790#endif
3791