/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/hugetlb.h>
44#include <linux/mman.h>
45#include <linux/swap.h>
46#include <linux/highmem.h>
47#include <linux/pagemap.h>
48#include <linux/ksm.h>
49#include <linux/rmap.h>
50#include <linux/module.h>
51#include <linux/delayacct.h>
52#include <linux/init.h>
53#include <linux/writeback.h>
54#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h>
56#include <linux/kallsyms.h>
57#include <linux/swapops.h>
58#include <linux/elf.h>
59
60#include <asm/io.h>
61#include <asm/pgalloc.h>
62#include <asm/uaccess.h>
63#include <asm/tlb.h>
64#include <asm/tlbflush.h>
65#include <asm/pgtable.h>
66
67#include "internal.h"
68
69#ifndef CONFIG_NEED_MULTIPLE_NODES
70
71unsigned long max_mapnr;
72struct page *mem_map;
73
74EXPORT_SYMBOL(max_mapnr);
75EXPORT_SYMBOL(mem_map);
76#endif
77
78unsigned long num_physpages;
79
80
81
82
83
84
85
86void * high_memory;
87
88EXPORT_SYMBOL(num_physpages);
89EXPORT_SYMBOL(high_memory);
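
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries may otherwise segfault. )
 */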
97int randomize_va_space __read_mostly =
98#ifdef CONFIG_COMPAT_BRK
99 1;
100#else
101 2;
102#endif
103
104static int __init disable_randmaps(char *s)
105{
106 randomize_va_space = 0;
107 return 1;
108}
109__setup("norandmaps", disable_randmaps);
110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114
115
116
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
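
/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */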
130void pgd_clear_bad(pgd_t *pgd)
131{
132 pgd_ERROR(*pgd);
133 pgd_clear(pgd);
134}
135
136void pud_clear_bad(pud_t *pud)
137{
138 pud_ERROR(*pud);
139 pud_clear(pud);
140}
141
142void pmd_clear_bad(pmd_t *pmd)
143{
144 pmd_ERROR(*pmd);
145 pmd_clear(pmd);
146}
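
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */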
152static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
153 unsigned long addr)
154{
155 pgtable_t token = pmd_pgtable(*pmd);
156 pmd_clear(pmd);
157 pte_free_tlb(tlb, token, addr);
158 tlb->mm->nr_ptes--;
159}
160
161static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
162 unsigned long addr, unsigned long end,
163 unsigned long floor, unsigned long ceiling)
164{
165 pmd_t *pmd;
166 unsigned long next;
167 unsigned long start;
168
169 start = addr;
170 pmd = pmd_offset(pud, addr);
171 do {
172 next = pmd_addr_end(addr, end);
173 if (pmd_none_or_clear_bad(pmd))
174 continue;
175 free_pte_range(tlb, pmd, addr);
176 } while (pmd++, addr = next, addr != end);
177
178 start &= PUD_MASK;
179 if (start < floor)
180 return;
181 if (ceiling) {
182 ceiling &= PUD_MASK;
183 if (!ceiling)
184 return;
185 }
186 if (end - 1 > ceiling - 1)
187 return;
188
189 pmd = pmd_offset(pud, start);
190 pud_clear(pud);
191 pmd_free_tlb(tlb, pmd, start);
192}
193
194static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
195 unsigned long addr, unsigned long end,
196 unsigned long floor, unsigned long ceiling)
197{
198 pud_t *pud;
199 unsigned long next;
200 unsigned long start;
201
202 start = addr;
203 pud = pud_offset(pgd, addr);
204 do {
205 next = pud_addr_end(addr, end);
206 if (pud_none_or_clear_bad(pud))
207 continue;
208 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
209 } while (pud++, addr = next, addr != end);
210
211 start &= PGDIR_MASK;
212 if (start < floor)
213 return;
214 if (ceiling) {
215 ceiling &= PGDIR_MASK;
216 if (!ceiling)
217 return;
218 }
219 if (end - 1 > ceiling - 1)
220 return;
221
222 pud = pud_offset(pgd, start);
223 pgd_clear(pgd);
224 pud_free_tlb(tlb, pud, start);
225}
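
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */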
232void free_pgd_range(struct mmu_gather *tlb,
233 unsigned long addr, unsigned long end,
234 unsigned long floor, unsigned long ceiling)
235{
236 pgd_t *pgd;
237 unsigned long next;
238 unsigned long start;
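
	/*
	 * The 'floor' and 'ceiling' arguments delimit the span the caller
	 * knows to be free of mappings; a page-table page is only freed
	 * when the whole region it covers falls inside that span.  The
	 * "- 1" comparisons below let a ceiling of 0 stand for the very
	 * top of the address space without overflowing.
	 */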
266 addr &= PMD_MASK;
267 if (addr < floor) {
268 addr += PMD_SIZE;
269 if (!addr)
270 return;
271 }
272 if (ceiling) {
273 ceiling &= PMD_MASK;
274 if (!ceiling)
275 return;
276 }
277 if (end - 1 > ceiling - 1)
278 end -= PMD_SIZE;
279 if (addr > end - 1)
280 return;
281
282 start = addr;
283 pgd = pgd_offset(tlb->mm, addr);
284 do {
285 next = pgd_addr_end(addr, end);
286 if (pgd_none_or_clear_bad(pgd))
287 continue;
288 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
289 } while (pgd++, addr = next, addr != end);
290}
291
292void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
293 unsigned long floor, unsigned long ceiling)
294{
295 while (vma) {
296 struct vm_area_struct *next = vma->vm_next;
297 unsigned long addr = vma->vm_start;
298
299
300
301
302
303 anon_vma_unlink(vma);
304 unlink_file_vma(vma);
305
306 if (is_vm_hugetlb_page(vma)) {
307 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
308 floor, next? next->vm_start: ceiling);
309 } else {
310
311
312
313 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
314 && !is_vm_hugetlb_page(next)) {
315 vma = next;
316 next = vma->vm_next;
317 anon_vma_unlink(vma);
318 unlink_file_vma(vma);
319 }
320 free_pgd_range(tlb, addr, vma->vm_end,
321 floor, next? next->vm_start: ceiling);
322 }
323 vma = next;
324 }
325}
326
327int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
328{
329 pgtable_t new = pte_alloc_one(mm, address);
330 if (!new)
331 return -ENOMEM;
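
	/*
	 * Ensure all pte setup is visible before the pte is made visible
	 * to other CPUs by being put into page tables: the smp_wmb() here
	 * pairs with the smp_read_barrier_depends() in the page-table
	 * walking code, so another CPU that sees the populated pmd also
	 * sees the initialized pte page behind it.
	 */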
346 smp_wmb();
347
348 spin_lock(&mm->page_table_lock);
349 if (!pmd_present(*pmd)) {
350 mm->nr_ptes++;
351 pmd_populate(mm, pmd, new);
352 new = NULL;
353 }
354 spin_unlock(&mm->page_table_lock);
355 if (new)
356 pte_free(mm, new);
357 return 0;
358}
359
360int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
361{
362 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
363 if (!new)
364 return -ENOMEM;
365
366 smp_wmb();
367
368 spin_lock(&init_mm.page_table_lock);
369 if (!pmd_present(*pmd)) {
370 pmd_populate_kernel(&init_mm, pmd, new);
371 new = NULL;
372 }
373 spin_unlock(&init_mm.page_table_lock);
374 if (new)
375 pte_free_kernel(&init_mm, new);
376 return 0;
377}
378
379static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
380{
381 if (file_rss)
382 add_mm_counter(mm, file_rss, file_rss);
383 if (anon_rss)
384 add_mm_counter(mm, anon_rss, anon_rss);
385}
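
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */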
394static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
395 pte_t pte, struct page *page)
396{
397 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
398 pud_t *pud = pud_offset(pgd, addr);
399 pmd_t *pmd = pmd_offset(pud, addr);
400 struct address_space *mapping;
401 pgoff_t index;
402 static unsigned long resume;
403 static unsigned long nr_shown;
404 static unsigned long nr_unshown;
405
406
407
408
409
410 if (nr_shown == 60) {
411 if (time_before(jiffies, resume)) {
412 nr_unshown++;
413 return;
414 }
415 if (nr_unshown) {
416 printk(KERN_ALERT
417 "BUG: Bad page map: %lu messages suppressed\n",
418 nr_unshown);
419 nr_unshown = 0;
420 }
421 nr_shown = 0;
422 }
423 if (nr_shown++ == 0)
424 resume = jiffies + 60 * HZ;
425
426 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
427 index = linear_page_index(vma, addr);
428
429 printk(KERN_ALERT
430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
431 current->comm,
432 (long long)pte_val(pte), (long long)pmd_val(*pmd));
433 if (page) {
434 printk(KERN_ALERT
435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
436 page, (void *)page->flags, page_count(page),
437 page_mapcount(page), page->mapping, page->index);
438 }
439 printk(KERN_ALERT
440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
442
443
444
445 if (vma->vm_ops)
446 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
447 (unsigned long)vma->vm_ops->fault);
448 if (vma->vm_file && vma->vm_file->f_op)
449 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
450 (unsigned long)vma->vm_file->f_op->mmap);
451 dump_stack();
452 add_taint(TAINT_BAD_PAGE);
453}
454
455static inline int is_cow_mapping(unsigned int flags)
456{
457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
458}
459
460#ifndef is_zero_pfn
461static inline int is_zero_pfn(unsigned long pfn)
462{
463 return pfn == zero_pfn;
464}
465#endif
466
467#ifndef my_zero_pfn
468static inline unsigned long my_zero_pfn(unsigned long addr)
469{
470 return zero_pfn;
471}
472#endif
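
/*
 * vm_normal_page -- this function gets the "struct page" associated
 * with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch
 * it).  That covers raw PFN mappings (VM_PFNMAP), except for the COW'ed
 * pages of a private mapping; VM_MIXEDMAP ptes whose pfn has no struct
 * page; and the shared zero page.
 *
 * When the architecture provides pte_special() (__HAVE_ARCH_PTE_SPECIAL)
 * the decision is encoded in the pte itself; otherwise it has to be
 * deduced from the vma flags, vm_pgoff and the pfn.  NULL is returned
 * for special mappings, and callers must not touch the struct page they
 * would otherwise have gotten.
 */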
516#ifdef __HAVE_ARCH_PTE_SPECIAL
517# define HAVE_PTE_SPECIAL 1
518#else
519# define HAVE_PTE_SPECIAL 0
520#endif
521struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
522 pte_t pte)
523{
524 unsigned long pfn = pte_pfn(pte);
525
526 if (HAVE_PTE_SPECIAL) {
527 if (likely(!pte_special(pte)))
528 goto check_pfn;
529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
530 return NULL;
531 if (!is_zero_pfn(pfn))
532 print_bad_pte(vma, addr, pte, NULL);
533 return NULL;
534 }
535
536
537
538 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
539 if (vma->vm_flags & VM_MIXEDMAP) {
540 if (!pfn_valid(pfn))
541 return NULL;
542 goto out;
543 } else {
544 unsigned long off;
545 off = (addr - vma->vm_start) >> PAGE_SHIFT;
546 if (pfn == vma->vm_pgoff + off)
547 return NULL;
548 if (!is_cow_mapping(vma->vm_flags))
549 return NULL;
550 }
551 }
552
553 if (is_zero_pfn(pfn))
554 return NULL;
555check_pfn:
556 if (unlikely(pfn > highest_memmap_pfn)) {
557 print_bad_pte(vma, addr, pte, NULL);
558 return NULL;
559 }
560
561
562
563
564
565out:
566 return pfn_to_page(pfn);
567}
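
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */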
575static inline void
576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578 unsigned long addr, int *rss)
579{
580 unsigned long vm_flags = vma->vm_flags;
581 pte_t pte = *src_pte;
582 struct page *page;
583
584
585 if (unlikely(!pte_present(pte))) {
586 if (!pte_file(pte)) {
587 swp_entry_t entry = pte_to_swp_entry(pte);
588
589 swap_duplicate(entry);
590
591 if (unlikely(list_empty(&dst_mm->mmlist))) {
592 spin_lock(&mmlist_lock);
593 if (list_empty(&dst_mm->mmlist))
594 list_add(&dst_mm->mmlist,
595 &src_mm->mmlist);
596 spin_unlock(&mmlist_lock);
597 }
598 if (is_write_migration_entry(entry) &&
599 is_cow_mapping(vm_flags)) {
600
601
602
603
604 make_migration_entry_read(&entry);
605 pte = swp_entry_to_pte(entry);
606 set_pte_at(src_mm, addr, src_pte, pte);
607 }
608 }
609 goto out_set_pte;
610 }
611
612
613
614
615
616 if (is_cow_mapping(vm_flags)) {
617 ptep_set_wrprotect(src_mm, addr, src_pte);
618 pte = pte_wrprotect(pte);
619 }
620
621
622
623
624
625 if (vm_flags & VM_SHARED)
626 pte = pte_mkclean(pte);
627 pte = pte_mkold(pte);
628
629 page = vm_normal_page(vma, addr, pte);
630 if (page) {
631 get_page(page);
632 page_dup_rmap(page);
633 rss[PageAnon(page)]++;
634 }
635
636out_set_pte:
637 set_pte_at(dst_mm, addr, dst_pte, pte);
638}
639
640static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
642 unsigned long addr, unsigned long end)
643{
644 pte_t *orig_src_pte, *orig_dst_pte;
645 pte_t *src_pte, *dst_pte;
646 spinlock_t *src_ptl, *dst_ptl;
647 int progress = 0;
648 int rss[2];
649
650again:
651 rss[1] = rss[0] = 0;
652 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
653 if (!dst_pte)
654 return -ENOMEM;
655 src_pte = pte_offset_map_nested(src_pmd, addr);
656 src_ptl = pte_lockptr(src_mm, src_pmd);
657 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
658 orig_src_pte = src_pte;
659 orig_dst_pte = dst_pte;
660 arch_enter_lazy_mmu_mode();
661
662 do {
663
664
665
666
667 if (progress >= 32) {
668 progress = 0;
669 if (need_resched() ||
670 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
671 break;
672 }
673 if (pte_none(*src_pte)) {
674 progress++;
675 continue;
676 }
677 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
678 progress += 8;
679 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
680
681 arch_leave_lazy_mmu_mode();
682 spin_unlock(src_ptl);
683 pte_unmap_nested(orig_src_pte);
684 add_mm_rss(dst_mm, rss[0], rss[1]);
685 pte_unmap_unlock(orig_dst_pte, dst_ptl);
686 cond_resched();
687 if (addr != end)
688 goto again;
689 return 0;
690}
691
692static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
693 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
694 unsigned long addr, unsigned long end)
695{
696 pmd_t *src_pmd, *dst_pmd;
697 unsigned long next;
698
699 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
700 if (!dst_pmd)
701 return -ENOMEM;
702 src_pmd = pmd_offset(src_pud, addr);
703 do {
704 next = pmd_addr_end(addr, end);
705 if (pmd_none_or_clear_bad(src_pmd))
706 continue;
707 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
708 vma, addr, next))
709 return -ENOMEM;
710 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
711 return 0;
712}
713
714static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
715 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
716 unsigned long addr, unsigned long end)
717{
718 pud_t *src_pud, *dst_pud;
719 unsigned long next;
720
721 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
722 if (!dst_pud)
723 return -ENOMEM;
724 src_pud = pud_offset(src_pgd, addr);
725 do {
726 next = pud_addr_end(addr, end);
727 if (pud_none_or_clear_bad(src_pud))
728 continue;
729 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
730 vma, addr, next))
731 return -ENOMEM;
732 } while (dst_pud++, src_pud++, addr = next, addr != end);
733 return 0;
734}
735
736int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
737 struct vm_area_struct *vma)
738{
739 pgd_t *src_pgd, *dst_pgd;
740 unsigned long next;
741 unsigned long addr = vma->vm_start;
742 unsigned long end = vma->vm_end;
743 int ret;
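
	/*
	 * Don't copy ptes where a page fault will fill them correctly:
	 * fork becomes much lighter for large mappings that have no
	 * anonymous pages and can simply be refaulted from the file.
	 */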
751 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
752 if (!vma->anon_vma)
753 return 0;
754 }
755
756 if (is_vm_hugetlb_page(vma))
757 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
758
759 if (unlikely(is_pfn_mapping(vma))) {
760
761
762
763
764 ret = track_pfn_vma_copy(vma);
765 if (ret)
766 return ret;
767 }
768
769
770
771
772
773
774
775 if (is_cow_mapping(vma->vm_flags))
776 mmu_notifier_invalidate_range_start(src_mm, addr, end);
777
778 ret = 0;
779 dst_pgd = pgd_offset(dst_mm, addr);
780 src_pgd = pgd_offset(src_mm, addr);
781 do {
782 next = pgd_addr_end(addr, end);
783 if (pgd_none_or_clear_bad(src_pgd))
784 continue;
785 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
786 vma, addr, next))) {
787 ret = -ENOMEM;
788 break;
789 }
790 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
791
792 if (is_cow_mapping(vma->vm_flags))
793 mmu_notifier_invalidate_range_end(src_mm,
794 vma->vm_start, end);
795 return ret;
796}
797
798static unsigned long zap_pte_range(struct mmu_gather *tlb,
799 struct vm_area_struct *vma, pmd_t *pmd,
800 unsigned long addr, unsigned long end,
801 long *zap_work, struct zap_details *details)
802{
803 struct mm_struct *mm = tlb->mm;
804 pte_t *pte;
805 spinlock_t *ptl;
806 int file_rss = 0;
807 int anon_rss = 0;
808
809 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
810 arch_enter_lazy_mmu_mode();
811 do {
812 pte_t ptent = *pte;
813 if (pte_none(ptent)) {
814 (*zap_work)--;
815 continue;
816 }
817
818 (*zap_work) -= PAGE_SIZE;
819
820 if (pte_present(ptent)) {
821 struct page *page;
822
823 page = vm_normal_page(vma, addr, ptent);
824 if (unlikely(details) && page) {
825
826
827
828
829
830 if (details->check_mapping &&
831 details->check_mapping != page->mapping)
832 continue;
833
834
835
836
837 if (details->nonlinear_vma &&
838 (page->index < details->first_index ||
839 page->index > details->last_index))
840 continue;
841 }
842 ptent = ptep_get_and_clear_full(mm, addr, pte,
843 tlb->fullmm);
844 tlb_remove_tlb_entry(tlb, pte, addr);
845 if (unlikely(!page))
846 continue;
847 if (unlikely(details) && details->nonlinear_vma
848 && linear_page_index(details->nonlinear_vma,
849 addr) != page->index)
850 set_pte_at(mm, addr, pte,
851 pgoff_to_pte(page->index));
852 if (PageAnon(page))
853 anon_rss--;
854 else {
855 if (pte_dirty(ptent))
856 set_page_dirty(page);
857 if (pte_young(ptent) &&
858 likely(!VM_SequentialReadHint(vma)))
859 mark_page_accessed(page);
860 file_rss--;
861 }
862 page_remove_rmap(page);
863 if (unlikely(page_mapcount(page) < 0))
864 print_bad_pte(vma, addr, ptent, page);
865 tlb_remove_page(tlb, page);
866 continue;
867 }
868
869
870
871
872 if (unlikely(details))
873 continue;
874 if (pte_file(ptent)) {
875 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
876 print_bad_pte(vma, addr, ptent, NULL);
		} else if (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
			print_bad_pte(vma, addr, ptent, NULL);
880 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
881 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
882
883 add_mm_rss(mm, file_rss, anon_rss);
884 arch_leave_lazy_mmu_mode();
885 pte_unmap_unlock(pte - 1, ptl);
886
887 return addr;
888}
889
890static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
891 struct vm_area_struct *vma, pud_t *pud,
892 unsigned long addr, unsigned long end,
893 long *zap_work, struct zap_details *details)
894{
895 pmd_t *pmd;
896 unsigned long next;
897
898 pmd = pmd_offset(pud, addr);
899 do {
900 next = pmd_addr_end(addr, end);
901 if (pmd_none_or_clear_bad(pmd)) {
902 (*zap_work)--;
903 continue;
904 }
905 next = zap_pte_range(tlb, vma, pmd, addr, next,
906 zap_work, details);
907 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
908
909 return addr;
910}
911
912static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
913 struct vm_area_struct *vma, pgd_t *pgd,
914 unsigned long addr, unsigned long end,
915 long *zap_work, struct zap_details *details)
916{
917 pud_t *pud;
918 unsigned long next;
919
920 pud = pud_offset(pgd, addr);
921 do {
922 next = pud_addr_end(addr, end);
923 if (pud_none_or_clear_bad(pud)) {
924 (*zap_work)--;
925 continue;
926 }
927 next = zap_pmd_range(tlb, vma, pud, addr, next,
928 zap_work, details);
929 } while (pud++, addr = next, (addr != end && *zap_work > 0));
930
931 return addr;
932}
933
934static unsigned long unmap_page_range(struct mmu_gather *tlb,
935 struct vm_area_struct *vma,
936 unsigned long addr, unsigned long end,
937 long *zap_work, struct zap_details *details)
938{
939 pgd_t *pgd;
940 unsigned long next;
941
942 if (details && !details->check_mapping && !details->nonlinear_vma)
943 details = NULL;
944
945 BUG_ON(addr >= end);
946 tlb_start_vma(tlb, vma);
947 pgd = pgd_offset(vma->vm_mm, addr);
948 do {
949 next = pgd_addr_end(addr, end);
950 if (pgd_none_or_clear_bad(pgd)) {
951 (*zap_work)--;
952 continue;
953 }
954 next = zap_pud_range(tlb, vma, pgd, addr, next,
955 zap_work, details);
956 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
957 tlb_end_vma(tlb, vma);
958
959 return addr;
960}
961
962#ifdef CONFIG_PREEMPT
963# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
964#else
965
966# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
967#endif
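
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.  Only ZAP_BLOCK_SIZE bytes are zapped
 * at a time; in between blocks the TLB gather is flushed and the locks
 * may be dropped for a reschedule point, so that other CPUs (and the
 * i_mmap_lock, if any) are not held off for too long.
 */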
995unsigned long unmap_vmas(struct mmu_gather **tlbp,
996 struct vm_area_struct *vma, unsigned long start_addr,
997 unsigned long end_addr, unsigned long *nr_accounted,
998 struct zap_details *details)
999{
1000 long zap_work = ZAP_BLOCK_SIZE;
1001 unsigned long tlb_start = 0;
1002 int tlb_start_valid = 0;
1003 unsigned long start = start_addr;
1004 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1005 int fullmm = (*tlbp)->fullmm;
1006 struct mm_struct *mm = vma->vm_mm;
1007
1008 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1009 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
1010 unsigned long end;
1011
1012 start = max(vma->vm_start, start_addr);
1013 if (start >= vma->vm_end)
1014 continue;
1015 end = min(vma->vm_end, end_addr);
1016 if (end <= vma->vm_start)
1017 continue;
1018
1019 if (vma->vm_flags & VM_ACCOUNT)
1020 *nr_accounted += (end - start) >> PAGE_SHIFT;
1021
1022 if (unlikely(is_pfn_mapping(vma)))
1023 untrack_pfn_vma(vma, 0, 0);
1024
1025 while (start != end) {
1026 if (!tlb_start_valid) {
1027 tlb_start = start;
1028 tlb_start_valid = 1;
1029 }
1030
1031 if (unlikely(is_vm_hugetlb_page(vma))) {
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043 if (vma->vm_file) {
1044 unmap_hugepage_range(vma, start, end, NULL);
1045 zap_work -= (end - start) /
1046 pages_per_huge_page(hstate_vma(vma));
1047 }
1048
1049 start = end;
1050 } else
1051 start = unmap_page_range(*tlbp, vma,
1052 start, end, &zap_work, details);
1053
1054 if (zap_work > 0) {
1055 BUG_ON(start != end);
1056 break;
1057 }
1058
1059 tlb_finish_mmu(*tlbp, tlb_start, start);
1060
1061 if (need_resched() ||
1062 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1063 if (i_mmap_lock) {
1064 *tlbp = NULL;
1065 goto out;
1066 }
1067 cond_resched();
1068 }
1069
1070 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1071 tlb_start_valid = 0;
1072 zap_work = ZAP_BLOCK_SIZE;
1073 }
1074 }
1075out:
1076 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1077 return start;
1078}
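
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */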
1087unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1088 unsigned long size, struct zap_details *details)
1089{
1090 struct mm_struct *mm = vma->vm_mm;
1091 struct mmu_gather *tlb;
1092 unsigned long end = address + size;
1093 unsigned long nr_accounted = 0;
1094
1095 lru_add_drain();
1096 tlb = tlb_gather_mmu(mm, 0);
1097 update_hiwater_rss(mm);
1098 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1099 if (tlb)
1100 tlb_finish_mmu(tlb, address, end);
1101 return end;
1102}
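
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */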
1116int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1117 unsigned long size)
1118{
1119 if (address < vma->vm_start || address + size > vma->vm_end ||
1120 !(vma->vm_flags & VM_PFNMAP))
1121 return -1;
1122 zap_page_range(vma, address, size, NULL);
1123 return 0;
1124}
1125EXPORT_SYMBOL_GPL(zap_vma_ptes);
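
/*
 * Do a quick page-table lookup for a single page; the caller passes
 * FOLL_* flags to say whether it wants the page got, touched, etc.
 */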
1130struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1131 unsigned int flags)
1132{
1133 pgd_t *pgd;
1134 pud_t *pud;
1135 pmd_t *pmd;
1136 pte_t *ptep, pte;
1137 spinlock_t *ptl;
1138 struct page *page;
1139 struct mm_struct *mm = vma->vm_mm;
1140
1141 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1142 if (!IS_ERR(page)) {
1143 BUG_ON(flags & FOLL_GET);
1144 goto out;
1145 }
1146
1147 page = NULL;
1148 pgd = pgd_offset(mm, address);
1149 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1150 goto no_page_table;
1151
1152 pud = pud_offset(pgd, address);
1153 if (pud_none(*pud))
1154 goto no_page_table;
1155 if (pud_huge(*pud)) {
1156 BUG_ON(flags & FOLL_GET);
1157 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1158 goto out;
1159 }
1160 if (unlikely(pud_bad(*pud)))
1161 goto no_page_table;
1162
1163 pmd = pmd_offset(pud, address);
1164 if (pmd_none(*pmd))
1165 goto no_page_table;
1166 if (pmd_huge(*pmd)) {
1167 BUG_ON(flags & FOLL_GET);
1168 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1169 goto out;
1170 }
1171 if (unlikely(pmd_bad(*pmd)))
1172 goto no_page_table;
1173
1174 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1175
1176 pte = *ptep;
1177 if (!pte_present(pte))
1178 goto no_page;
1179 if ((flags & FOLL_WRITE) && !pte_write(pte))
1180 goto unlock;
1181
1182 page = vm_normal_page(vma, address, pte);
1183 if (unlikely(!page)) {
1184 if ((flags & FOLL_DUMP) ||
1185 !is_zero_pfn(pte_pfn(pte)))
1186 goto bad_page;
1187 page = pte_page(pte);
1188 }
1189
1190 if (flags & FOLL_GET)
1191 get_page(page);
1192 if (flags & FOLL_TOUCH) {
1193 if ((flags & FOLL_WRITE) &&
1194 !pte_dirty(pte) && !PageDirty(page))
1195 set_page_dirty(page);
1196
1197
1198
1199
1200
1201 mark_page_accessed(page);
1202 }
1203unlock:
1204 pte_unmap_unlock(ptep, ptl);
1205out:
1206 return page;
1207
1208bad_page:
1209 pte_unmap_unlock(ptep, ptl);
1210 return ERR_PTR(-EFAULT);
1211
1212no_page:
1213 pte_unmap_unlock(ptep, ptl);
1214 if (!pte_none(pte))
1215 return page;
1216
1217no_page_table:
1218
1219
1220
1221
1222
1223
1224
1225
1226 if ((flags & FOLL_DUMP) &&
1227 (!vma->vm_ops || !vma->vm_ops->fault))
1228 return ERR_PTR(-EFAULT);
1229 return page;
1230}
1231
1232int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1233 unsigned long start, int nr_pages, unsigned int gup_flags,
1234 struct page **pages, struct vm_area_struct **vmas)
1235{
1236 int i;
1237 unsigned long vm_flags;
1238
1239 if (nr_pages <= 0)
1240 return 0;
1241
1242 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1243
1244
1245
1246
1247
1248 vm_flags = (gup_flags & FOLL_WRITE) ?
1249 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1250 vm_flags &= (gup_flags & FOLL_FORCE) ?
1251 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1252 i = 0;
1253
1254 do {
1255 struct vm_area_struct *vma;
1256
1257 vma = find_extend_vma(mm, start);
1258 if (!vma && in_gate_area(tsk, start)) {
1259 unsigned long pg = start & PAGE_MASK;
1260 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1261 pgd_t *pgd;
1262 pud_t *pud;
1263 pmd_t *pmd;
1264 pte_t *pte;
1265
1266
1267 if (gup_flags & FOLL_WRITE)
1268 return i ? : -EFAULT;
1269 if (pg > TASK_SIZE)
1270 pgd = pgd_offset_k(pg);
1271 else
1272 pgd = pgd_offset_gate(mm, pg);
1273 BUG_ON(pgd_none(*pgd));
1274 pud = pud_offset(pgd, pg);
1275 BUG_ON(pud_none(*pud));
1276 pmd = pmd_offset(pud, pg);
1277 if (pmd_none(*pmd))
1278 return i ? : -EFAULT;
1279 pte = pte_offset_map(pmd, pg);
1280 if (pte_none(*pte)) {
1281 pte_unmap(pte);
1282 return i ? : -EFAULT;
1283 }
1284 if (pages) {
1285 struct page *page = vm_normal_page(gate_vma, start, *pte);
1286 pages[i] = page;
1287 if (page)
1288 get_page(page);
1289 }
1290 pte_unmap(pte);
1291 if (vmas)
1292 vmas[i] = gate_vma;
1293 i++;
1294 start += PAGE_SIZE;
1295 nr_pages--;
1296 continue;
1297 }
1298
1299 if (!vma ||
1300 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1301 !(vm_flags & vma->vm_flags))
1302 return i ? : -EFAULT;
1303
1304 if (is_vm_hugetlb_page(vma)) {
1305 i = follow_hugetlb_page(mm, vma, pages, vmas,
1306 &start, &nr_pages, i, gup_flags);
1307 continue;
1308 }
1309
1310 do {
1311 struct page *page;
1312 unsigned int foll_flags = gup_flags;
1313
1314
1315
1316
1317
1318 if (unlikely(fatal_signal_pending(current)))
1319 return i ? i : -ERESTARTSYS;
1320
1321 cond_resched();
1322 while (!(page = follow_page(vma, start, foll_flags))) {
1323 int ret;
1324
1325 ret = handle_mm_fault(mm, vma, start,
1326 (foll_flags & FOLL_WRITE) ?
1327 FAULT_FLAG_WRITE : 0);
1328
1329 if (ret & VM_FAULT_ERROR) {
1330 if (ret & VM_FAULT_OOM)
1331 return i ? i : -ENOMEM;
1332 if (ret &
1333 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1334 return i ? i : -EFAULT;
1335 BUG();
1336 }
1337 if (ret & VM_FAULT_MAJOR)
1338 tsk->maj_flt++;
1339 else
1340 tsk->min_flt++;
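
				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads. But only
				 * do so when looping for pte_write is futile:
				 * in some cases userspace may also be wanting
				 * to write to the gotten user page, which a
				 * read fault here might prevent (a readonly
				 * page might get reCOWed by userspace write).
				 */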
1354 if ((ret & VM_FAULT_WRITE) &&
1355 !(vma->vm_flags & VM_WRITE))
1356 foll_flags &= ~FOLL_WRITE;
1357
1358 cond_resched();
1359 }
1360 if (IS_ERR(page))
1361 return i ? i : PTR_ERR(page);
1362 if (pages) {
1363 pages[i] = page;
1364
1365 flush_anon_page(vma, page, start);
1366 flush_dcache_page(page);
1367 }
1368 if (vmas)
1369 vmas[i] = vma;
1370 i++;
1371 start += PAGE_SIZE;
1372 nr_pages--;
1373 } while (nr_pages && start < vma->vm_end);
1374 } while (nr_pages);
1375 return i;
1376}
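
/**
 * get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to by the caller
 * @force:	whether to force write access even if the user mapping is
 *		currently readonly (ptrace breakpoints being the main user)
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.  Or NULL, if the caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned.  This may be fewer than the number
 * requested.  If nr_pages is 0 or negative, returns 0; if no pages were
 * pinned, returns -errno.  Each page returned must be released with
 * put_page() when the caller is finished with it.
 *
 * Must be called with mmap_sem held for read or write.
 */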
1428int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1429 unsigned long start, int nr_pages, int write, int force,
1430 struct page **pages, struct vm_area_struct **vmas)
1431{
1432 int flags = FOLL_TOUCH;
1433
1434 if (pages)
1435 flags |= FOLL_GET;
1436 if (write)
1437 flags |= FOLL_WRITE;
1438 if (force)
1439 flags |= FOLL_FORCE;
1440
1441 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1442}
1443EXPORT_SYMBOL(get_user_pages);
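
/*
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by page_cache_release() or put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 */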
1459#ifdef CONFIG_ELF_CORE
1460struct page *get_dump_page(unsigned long addr)
1461{
1462 struct vm_area_struct *vma;
1463 struct page *page;
1464
1465 if (__get_user_pages(current, current->mm, addr, 1,
1466 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1467 return NULL;
1468 flush_cache_page(vma, addr, page_to_pfn(page));
1469 return page;
1470}
1471#endif
1472
1473pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1474 spinlock_t **ptl)
1475{
1476 pgd_t * pgd = pgd_offset(mm, addr);
1477 pud_t * pud = pud_alloc(mm, pgd, addr);
1478 if (pud) {
1479 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1480 if (pmd)
1481 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1482 }
1483 return NULL;
1484}
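
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */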
1493static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1494 struct page *page, pgprot_t prot)
1495{
1496 struct mm_struct *mm = vma->vm_mm;
1497 int retval;
1498 pte_t *pte;
1499 spinlock_t *ptl;
1500
1501 retval = -EINVAL;
1502 if (PageAnon(page))
1503 goto out;
1504 retval = -ENOMEM;
1505 flush_dcache_page(page);
1506 pte = get_locked_pte(mm, addr, &ptl);
1507 if (!pte)
1508 goto out;
1509 retval = -EBUSY;
1510 if (!pte_none(*pte))
1511 goto out_unlock;
1512
1513
1514 get_page(page);
1515 inc_mm_counter(mm, file_rss);
1516 page_add_file_rmap(page);
1517 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1518
1519 retval = 0;
1520 pte_unmap_unlock(pte, ptl);
1521 return retval;
1522out_unlock:
1523 pte_unmap_unlock(pte, ptl);
1524out:
1525 return retval;
1526}
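
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 */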
1550int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1551 struct page *page)
1552{
1553 if (addr < vma->vm_start || addr >= vma->vm_end)
1554 return -EFAULT;
1555 if (!page_count(page))
1556 return -EINVAL;
1557 vma->vm_flags |= VM_INSERTPAGE;
1558 return insert_page(vma, addr, page, vma->vm_page_prot);
1559}
1560EXPORT_SYMBOL(vm_insert_page);
1561
1562static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1563 unsigned long pfn, pgprot_t prot)
1564{
1565 struct mm_struct *mm = vma->vm_mm;
1566 int retval;
1567 pte_t *pte, entry;
1568 spinlock_t *ptl;
1569
1570 retval = -ENOMEM;
1571 pte = get_locked_pte(mm, addr, &ptl);
1572 if (!pte)
1573 goto out;
1574 retval = -EBUSY;
1575 if (!pte_none(*pte))
1576 goto out_unlock;
1577
1578
1579 entry = pte_mkspecial(pfn_pte(pfn, prot));
1580 set_pte_at(mm, addr, pte, entry);
1581 update_mmu_cache(vma, addr, entry);
1582
1583 retval = 0;
1584out_unlock:
1585 pte_unmap_unlock(pte, ptl);
1586out:
1587 return retval;
1588}
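
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */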
1607int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1608 unsigned long pfn)
1609{
1610 int ret;
1611 pgprot_t pgprot = vma->vm_page_prot;
1612
1613
1614
1615
1616
1617
1618 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1619 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1620 (VM_PFNMAP|VM_MIXEDMAP));
1621 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1622 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1623
1624 if (addr < vma->vm_start || addr >= vma->vm_end)
1625 return -EFAULT;
1626 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1627 return -EINVAL;
1628
1629 ret = insert_pfn(vma, addr, pfn, pgprot);
1630
1631 if (ret)
1632 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
1633
1634 return ret;
1635}
1636EXPORT_SYMBOL(vm_insert_pfn);
1637
1638int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1639 unsigned long pfn)
1640{
1641 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1642
1643 if (addr < vma->vm_start || addr >= vma->vm_end)
1644 return -EFAULT;
1645
1646
1647
1648
1649
1650
1651
1652
1653 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1654 struct page *page;
1655
1656 page = pfn_to_page(pfn);
1657 return insert_page(vma, addr, page, vma->vm_page_prot);
1658 }
1659 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1660}
1661EXPORT_SYMBOL(vm_insert_mixed);
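
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */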
1668static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1669 unsigned long addr, unsigned long end,
1670 unsigned long pfn, pgprot_t prot)
1671{
1672 pte_t *pte;
1673 spinlock_t *ptl;
1674
1675 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1676 if (!pte)
1677 return -ENOMEM;
1678 arch_enter_lazy_mmu_mode();
1679 do {
1680 BUG_ON(!pte_none(*pte));
1681 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1682 pfn++;
1683 } while (pte++, addr += PAGE_SIZE, addr != end);
1684 arch_leave_lazy_mmu_mode();
1685 pte_unmap_unlock(pte - 1, ptl);
1686 return 0;
1687}
1688
1689static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1690 unsigned long addr, unsigned long end,
1691 unsigned long pfn, pgprot_t prot)
1692{
1693 pmd_t *pmd;
1694 unsigned long next;
1695
1696 pfn -= addr >> PAGE_SHIFT;
1697 pmd = pmd_alloc(mm, pud, addr);
1698 if (!pmd)
1699 return -ENOMEM;
1700 do {
1701 next = pmd_addr_end(addr, end);
1702 if (remap_pte_range(mm, pmd, addr, next,
1703 pfn + (addr >> PAGE_SHIFT), prot))
1704 return -ENOMEM;
1705 } while (pmd++, addr = next, addr != end);
1706 return 0;
1707}
1708
1709static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1710 unsigned long addr, unsigned long end,
1711 unsigned long pfn, pgprot_t prot)
1712{
1713 pud_t *pud;
1714 unsigned long next;
1715
1716 pfn -= addr >> PAGE_SHIFT;
1717 pud = pud_alloc(mm, pgd, addr);
1718 if (!pud)
1719 return -ENOMEM;
1720 do {
1721 next = pud_addr_end(addr, end);
1722 if (remap_pmd_range(mm, pud, addr, next,
1723 pfn + (addr >> PAGE_SHIFT), prot))
1724 return -ENOMEM;
1725 } while (pud++, addr = next, addr != end);
1726 return 0;
1727}
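
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */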
1739int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1740 unsigned long pfn, unsigned long size, pgprot_t prot)
1741{
1742 pgd_t *pgd;
1743 unsigned long next;
1744 unsigned long end = addr + PAGE_ALIGN(size);
1745 struct mm_struct *mm = vma->vm_mm;
1746 int err;
1747
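
	/*
	 * Physically remapped pages are special: they are not tracked by
	 * rmap and must not be touched by the core VM, hence VM_IO and
	 * VM_RESERVED; VM_PFNMAP tells vm_normal_page() that raw PFNs are
	 * being mapped.  When the mapping covers the whole vma we stash
	 * the first pfn in vm_pgoff (VM_PFN_AT_MMAP) so that the pfn-range
	 * tracking code (track_pfn_vma_copy/untrack_pfn_vma) can recover
	 * the range later; a partial remap of a COW mapping cannot be
	 * supported.
	 */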
1766 if (addr == vma->vm_start && end == vma->vm_end) {
1767 vma->vm_pgoff = pfn;
1768 vma->vm_flags |= VM_PFN_AT_MMAP;
1769 } else if (is_cow_mapping(vma->vm_flags))
1770 return -EINVAL;
1771
1772 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1773
1774 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1775 if (err) {
1776
1777
1778
1779
1780 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1781 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1782 return -EINVAL;
1783 }
1784
1785 BUG_ON(addr >= end);
1786 pfn -= addr >> PAGE_SHIFT;
1787 pgd = pgd_offset(mm, addr);
1788 flush_cache_range(vma, addr, end);
1789 do {
1790 next = pgd_addr_end(addr, end);
1791 err = remap_pud_range(mm, pgd, addr, next,
1792 pfn + (addr >> PAGE_SHIFT), prot);
1793 if (err)
1794 break;
1795 } while (pgd++, addr = next, addr != end);
1796
1797 if (err)
1798 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
1799
1800 return err;
1801}
1802EXPORT_SYMBOL(remap_pfn_range);
1803
1804static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1805 unsigned long addr, unsigned long end,
1806 pte_fn_t fn, void *data)
1807{
1808 pte_t *pte;
1809 int err;
1810 pgtable_t token;
1811 spinlock_t *uninitialized_var(ptl);
1812
1813 pte = (mm == &init_mm) ?
1814 pte_alloc_kernel(pmd, addr) :
1815 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1816 if (!pte)
1817 return -ENOMEM;
1818
1819 BUG_ON(pmd_huge(*pmd));
1820
1821 arch_enter_lazy_mmu_mode();
1822
1823 token = pmd_pgtable(*pmd);
1824
1825 do {
1826 err = fn(pte++, token, addr, data);
1827 if (err)
1828 break;
1829 } while (addr += PAGE_SIZE, addr != end);
1830
1831 arch_leave_lazy_mmu_mode();
1832
1833 if (mm != &init_mm)
1834 pte_unmap_unlock(pte-1, ptl);
1835 return err;
1836}
1837
1838static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1839 unsigned long addr, unsigned long end,
1840 pte_fn_t fn, void *data)
1841{
1842 pmd_t *pmd;
1843 unsigned long next;
1844 int err;
1845
1846 BUG_ON(pud_huge(*pud));
1847
1848 pmd = pmd_alloc(mm, pud, addr);
1849 if (!pmd)
1850 return -ENOMEM;
1851 do {
1852 next = pmd_addr_end(addr, end);
1853 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1854 if (err)
1855 break;
1856 } while (pmd++, addr = next, addr != end);
1857 return err;
1858}
1859
1860static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1861 unsigned long addr, unsigned long end,
1862 pte_fn_t fn, void *data)
1863{
1864 pud_t *pud;
1865 unsigned long next;
1866 int err;
1867
1868 pud = pud_alloc(mm, pgd, addr);
1869 if (!pud)
1870 return -ENOMEM;
1871 do {
1872 next = pud_addr_end(addr, end);
1873 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1874 if (err)
1875 break;
1876 } while (pud++, addr = next, addr != end);
1877 return err;
1878}
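
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */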
1884int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1885 unsigned long size, pte_fn_t fn, void *data)
1886{
1887 pgd_t *pgd;
1888 unsigned long next;
1889 unsigned long start = addr, end = addr + size;
1890 int err;
1891
1892 BUG_ON(addr >= end);
1893 mmu_notifier_invalidate_range_start(mm, start, end);
1894 pgd = pgd_offset(mm, addr);
1895 do {
1896 next = pgd_addr_end(addr, end);
1897 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1898 if (err)
1899 break;
1900 } while (pgd++, addr = next, addr != end);
1901 mmu_notifier_invalidate_range_end(mm, start, end);
1902 return err;
1903}
1904EXPORT_SYMBOL_GPL(apply_to_page_range);
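
/*
 * handle_pte_fault chooses page fault handler according to an entry
 * which was read non-atomically.  Before making any commitment, on
 * those architectures or configurations (e.g. i386 with PAE) which
 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
 * must check under lock before unmapping the pte and proceeding
 * (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */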
1915static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1916 pte_t *page_table, pte_t orig_pte)
1917{
1918 int same = 1;
1919#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1920 if (sizeof(pte_t) > sizeof(unsigned long)) {
1921 spinlock_t *ptl = pte_lockptr(mm, pmd);
1922 spin_lock(ptl);
1923 same = pte_same(*page_table, orig_pte);
1924 spin_unlock(ptl);
1925 }
1926#endif
1927 pte_unmap(page_table);
1928 return same;
1929}
1930
1931
1932
1933
1934
1935
1936
1937static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1938{
1939 if (likely(vma->vm_flags & VM_WRITE))
1940 pte = pte_mkwrite(pte);
1941 return pte;
1942}
1943
1944static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1945{
1946
1947
1948
1949
1950
1951
1952 if (unlikely(!src)) {
1953 void *kaddr = kmap_atomic(dst, KM_USER0);
1954 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1955
1956
1957
1958
1959
1960
1961
1962 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1963 memset(kaddr, 0, PAGE_SIZE);
1964 kunmap_atomic(kaddr, KM_USER0);
1965 flush_dcache_page(dst);
1966 } else
1967 copy_user_highpage(dst, src, va, vma);
1968}
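
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */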
1988static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1989 unsigned long address, pte_t *page_table, pmd_t *pmd,
1990 spinlock_t *ptl, pte_t orig_pte)
1991{
1992 struct page *old_page, *new_page;
1993 pte_t entry;
1994 int reuse = 0, ret = 0;
1995 int page_mkwrite = 0;
1996 struct page *dirty_page = NULL;
1997
1998 old_page = vm_normal_page(vma, address, orig_pte);
1999 if (!old_page) {
2000
2001
2002
2003
2004
2005
2006
2007 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2008 (VM_WRITE|VM_SHARED))
2009 goto reuse;
2010 goto gotten;
2011 }
2012
2013
2014
2015
2016
2017 if (PageAnon(old_page) && !PageKsm(old_page)) {
2018 if (!trylock_page(old_page)) {
2019 page_cache_get(old_page);
2020 pte_unmap_unlock(page_table, ptl);
2021 lock_page(old_page);
2022 page_table = pte_offset_map_lock(mm, pmd, address,
2023 &ptl);
2024 if (!pte_same(*page_table, orig_pte)) {
2025 unlock_page(old_page);
2026 page_cache_release(old_page);
2027 goto unlock;
2028 }
2029 page_cache_release(old_page);
2030 }
2031 reuse = reuse_swap_page(old_page);
2032 unlock_page(old_page);
2033 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2034 (VM_WRITE|VM_SHARED))) {
2035
2036
2037
2038
2039
2040 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2041 struct vm_fault vmf;
2042 int tmp;
2043
2044 vmf.virtual_address = (void __user *)(address &
2045 PAGE_MASK);
2046 vmf.pgoff = old_page->index;
2047 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2048 vmf.page = old_page;
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058 page_cache_get(old_page);
2059 pte_unmap_unlock(page_table, ptl);
2060
2061 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2062 if (unlikely(tmp &
2063 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2064 ret = tmp;
2065 goto unwritable_page;
2066 }
2067 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2068 lock_page(old_page);
2069 if (!old_page->mapping) {
2070 ret = 0;
2071 unlock_page(old_page);
2072 goto unwritable_page;
2073 }
2074 } else
2075 VM_BUG_ON(!PageLocked(old_page));
2076
2077
2078
2079
2080
2081
2082
2083 page_table = pte_offset_map_lock(mm, pmd, address,
2084 &ptl);
2085 if (!pte_same(*page_table, orig_pte)) {
2086 unlock_page(old_page);
2087 page_cache_release(old_page);
2088 goto unlock;
2089 }
2090
2091 page_mkwrite = 1;
2092 }
2093 dirty_page = old_page;
2094 get_page(dirty_page);
2095 reuse = 1;
2096 }
2097
2098 if (reuse) {
2099reuse:
2100 flush_cache_page(vma, address, pte_pfn(orig_pte));
2101 entry = pte_mkyoung(orig_pte);
2102 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2103 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2104 update_mmu_cache(vma, address, entry);
2105 ret |= VM_FAULT_WRITE;
2106 goto unlock;
2107 }
2108
2109
2110
2111
2112 page_cache_get(old_page);
2113gotten:
2114 pte_unmap_unlock(page_table, ptl);
2115
2116 if (unlikely(anon_vma_prepare(vma)))
2117 goto oom;
2118
2119 if (is_zero_pfn(pte_pfn(orig_pte))) {
2120 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2121 if (!new_page)
2122 goto oom;
2123 } else {
2124 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2125 if (!new_page)
2126 goto oom;
2127 cow_user_page(new_page, old_page, address, vma);
2128 }
2129 __SetPageUptodate(new_page);
2130
2131
2132
2133
2134
2135 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2136 lock_page(old_page);
2137 clear_page_mlock(old_page);
2138 unlock_page(old_page);
2139 }
2140
2141 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2142 goto oom_free_new;
2143
2144
2145
2146
2147 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2148 if (likely(pte_same(*page_table, orig_pte))) {
2149 if (old_page) {
2150 if (!PageAnon(old_page)) {
2151 dec_mm_counter(mm, file_rss);
2152 inc_mm_counter(mm, anon_rss);
2153 }
2154 } else
2155 inc_mm_counter(mm, anon_rss);
2156 flush_cache_page(vma, address, pte_pfn(orig_pte));
2157 entry = mk_pte(new_page, vma->vm_page_prot);
2158 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2159
2160
2161
2162
2163
2164
2165 ptep_clear_flush(vma, address, page_table);
2166 page_add_new_anon_rmap(new_page, vma, address);
2167
2168
2169
2170
2171
2172 set_pte_at_notify(mm, address, page_table, entry);
2173 update_mmu_cache(vma, address, entry);
2174 if (old_page) {
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197 page_remove_rmap(old_page);
2198 }
2199
2200
2201 new_page = old_page;
2202 ret |= VM_FAULT_WRITE;
2203 } else
2204 mem_cgroup_uncharge_page(new_page);
2205
2206 if (new_page)
2207 page_cache_release(new_page);
2208 if (old_page)
2209 page_cache_release(old_page);
2210unlock:
2211 pte_unmap_unlock(page_table, ptl);
2212 if (dirty_page) {
2213
2214
2215
2216
2217
2218
2219
2220
2221 if (!page_mkwrite) {
2222 wait_on_page_locked(dirty_page);
2223 set_page_dirty_balance(dirty_page, page_mkwrite);
2224 }
2225 put_page(dirty_page);
2226 if (page_mkwrite) {
2227 struct address_space *mapping = dirty_page->mapping;
2228
2229 set_page_dirty(dirty_page);
2230 unlock_page(dirty_page);
2231 page_cache_release(dirty_page);
2232 if (mapping) {
2233
2234
2235
2236
2237 balance_dirty_pages_ratelimited(mapping);
2238 }
2239 }
2240
2241
2242 if (vma->vm_file)
2243 file_update_time(vma->vm_file);
2244 }
2245 return ret;
2246oom_free_new:
2247 page_cache_release(new_page);
2248oom:
2249 if (old_page) {
2250 if (page_mkwrite) {
2251 unlock_page(old_page);
2252 page_cache_release(old_page);
2253 }
2254 page_cache_release(old_page);
2255 }
2256 return VM_FAULT_OOM;
2257
2258unwritable_page:
2259 page_cache_release(old_page);
2260 return ret;
2261}
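
/*
 * Helper functions for unmap_mapping_range(): to keep i_mmap_lock hold
 * times down, unmap_mapping_range_vma() may drop the lock, reschedule
 * and ask its caller to restart.  vma->vm_truncate_count records how
 * far a vma has been processed: either the current truncate_count
 * (fully done) or a page-aligned restart address (partly done), the
 * two being told apart by is_restart_addr() below.
 */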
2295#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2296
2297static void reset_vma_truncate_counts(struct address_space *mapping)
2298{
2299 struct vm_area_struct *vma;
2300 struct prio_tree_iter iter;
2301
2302 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2303 vma->vm_truncate_count = 0;
2304 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2305 vma->vm_truncate_count = 0;
2306}
2307
2308static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2309 unsigned long start_addr, unsigned long end_addr,
2310 struct zap_details *details)
2311{
2312 unsigned long restart_addr;
2313 int need_break;
2314
2315
2316
2317
2318
2319
2320
2321
2322again:
2323 restart_addr = vma->vm_truncate_count;
2324 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2325 start_addr = restart_addr;
2326 if (start_addr >= end_addr) {
2327
2328 vma->vm_truncate_count = details->truncate_count;
2329 return 0;
2330 }
2331 }
2332
2333 restart_addr = zap_page_range(vma, start_addr,
2334 end_addr - start_addr, details);
2335 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2336
2337 if (restart_addr >= end_addr) {
2338
2339 vma->vm_truncate_count = details->truncate_count;
2340 if (!need_break)
2341 return 0;
2342 } else {
2343
2344 vma->vm_truncate_count = restart_addr;
2345 if (!need_break)
2346 goto again;
2347 }
2348
2349 spin_unlock(details->i_mmap_lock);
2350 cond_resched();
2351 spin_lock(details->i_mmap_lock);
2352 return -EINTR;
2353}
2354
2355static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2356 struct zap_details *details)
2357{
2358 struct vm_area_struct *vma;
2359 struct prio_tree_iter iter;
2360 pgoff_t vba, vea, zba, zea;
2361
2362restart:
2363 vma_prio_tree_foreach(vma, &iter, root,
2364 details->first_index, details->last_index) {
2365
2366 if (vma->vm_truncate_count == details->truncate_count)
2367 continue;
2368
2369 vba = vma->vm_pgoff;
2370 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2371
2372 zba = details->first_index;
2373 if (zba < vba)
2374 zba = vba;
2375 zea = details->last_index;
2376 if (zea > vea)
2377 zea = vea;
2378
2379 if (unmap_mapping_range_vma(vma,
2380 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2381 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2382 details) < 0)
2383 goto restart;
2384 }
2385}
2386
2387static inline void unmap_mapping_range_list(struct list_head *head,
2388 struct zap_details *details)
2389{
2390 struct vm_area_struct *vma;
2391
2392
2393
2394
2395
2396
2397
2398restart:
2399 list_for_each_entry(vma, head, shared.vm_set.list) {
2400
2401 if (vma->vm_truncate_count == details->truncate_count)
2402 continue;
2403 details->nonlinear_vma = vma;
2404 if (unmap_mapping_range_vma(vma, vma->vm_start,
2405 vma->vm_end, details) < 0)
2406 goto restart;
2407 }
2408}
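
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the
 * specified address_space corresponding to the specified page range
 * in the underlying file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(),
 * which must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */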
2424void unmap_mapping_range(struct address_space *mapping,
2425 loff_t const holebegin, loff_t const holelen, int even_cows)
2426{
2427 struct zap_details details;
2428 pgoff_t hba = holebegin >> PAGE_SHIFT;
2429 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2430
2431
2432 if (sizeof(holelen) > sizeof(hlen)) {
2433 long long holeend =
2434 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2435 if (holeend & ~(long long)ULONG_MAX)
2436 hlen = ULONG_MAX - hba + 1;
2437 }
2438
2439 details.check_mapping = even_cows? NULL: mapping;
2440 details.nonlinear_vma = NULL;
2441 details.first_index = hba;
2442 details.last_index = hba + hlen - 1;
2443 if (details.last_index < details.first_index)
2444 details.last_index = ULONG_MAX;
2445 details.i_mmap_lock = &mapping->i_mmap_lock;
2446
2447 spin_lock(&mapping->i_mmap_lock);
2448
2449
2450 mapping->truncate_count++;
2451 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2452 if (mapping->truncate_count == 0)
2453 reset_vma_truncate_counts(mapping);
2454 mapping->truncate_count++;
2455 }
2456 details.truncate_count = mapping->truncate_count;
2457
2458 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2459 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2460 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2461 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2462 spin_unlock(&mapping->i_mmap_lock);
2463}
2464EXPORT_SYMBOL(unmap_mapping_range);
2465
2466int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2467{
2468 struct address_space *mapping = inode->i_mapping;
2469
2470
2471
2472
2473
2474
2475 if (!inode->i_op->truncate_range)
2476 return -ENOSYS;
2477
2478 mutex_lock(&inode->i_mutex);
2479 down_write(&inode->i_alloc_sem);
2480 unmap_mapping_range(mapping, offset, (end - offset), 1);
2481 truncate_inode_pages_range(mapping, offset, end);
2482 unmap_mapping_range(mapping, offset, (end - offset), 1);
2483 inode->i_op->truncate_range(inode, offset, end);
2484 up_write(&inode->i_alloc_sem);
2485 mutex_unlock(&inode->i_mutex);
2486
2487 return 0;
2488}
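
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */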
2495static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2496 unsigned long address, pte_t *page_table, pmd_t *pmd,
2497 unsigned int flags, pte_t orig_pte)
2498{
2499 spinlock_t *ptl;
2500 struct page *page;
2501 swp_entry_t entry;
2502 pte_t pte;
2503 struct mem_cgroup *ptr = NULL;
2504 int ret = 0;
2505
2506 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2507 goto out;
2508
2509 entry = pte_to_swp_entry(orig_pte);
2510 if (unlikely(non_swap_entry(entry))) {
2511 if (is_migration_entry(entry)) {
2512 migration_entry_wait(mm, pmd, address);
2513 } else if (is_hwpoison_entry(entry)) {
2514 ret = VM_FAULT_HWPOISON;
2515 } else {
2516 print_bad_pte(vma, address, orig_pte, NULL);
2517 ret = VM_FAULT_OOM;
2518 }
2519 goto out;
2520 }
2521 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2522 page = lookup_swap_cache(entry);
2523 if (!page) {
2524 grab_swap_token(mm);
2525 page = swapin_readahead(entry,
2526 GFP_HIGHUSER_MOVABLE, vma, address);
2527 if (!page) {
2528
2529
2530
2531
2532 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2533 if (likely(pte_same(*page_table, orig_pte)))
2534 ret = VM_FAULT_OOM;
2535 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2536 goto unlock;
2537 }
2538
2539
2540 ret = VM_FAULT_MAJOR;
2541 count_vm_event(PGMAJFAULT);
2542 } else if (PageHWPoison(page)) {
2543 ret = VM_FAULT_HWPOISON;
2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2545 goto out_release;
2546 }
2547
2548 lock_page(page);
2549 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2550
2551 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2552 ret = VM_FAULT_OOM;
2553 goto out_page;
2554 }
2555
2556
2557
2558
2559 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2560 if (unlikely(!pte_same(*page_table, orig_pte)))
2561 goto out_nomap;
2562
2563 if (unlikely(!PageUptodate(page))) {
2564 ret = VM_FAULT_SIGBUS;
2565 goto out_nomap;
2566 }
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582 inc_mm_counter(mm, anon_rss);
2583 pte = mk_pte(page, vma->vm_page_prot);
2584 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2585 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2586 flags &= ~FAULT_FLAG_WRITE;
2587 }
2588 flush_icache_page(vma, page);
2589 set_pte_at(mm, address, page_table, pte);
2590 page_add_anon_rmap(page, vma, address);
2591
2592 mem_cgroup_commit_charge_swapin(page, ptr);
2593
2594 swap_free(entry);
2595 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2596 try_to_free_swap(page);
2597 unlock_page(page);
2598
2599 if (flags & FAULT_FLAG_WRITE) {
2600 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2601 if (ret & VM_FAULT_ERROR)
2602 ret &= VM_FAULT_ERROR;
2603 goto out;
2604 }
2605
2606
2607 update_mmu_cache(vma, address, pte);
2608unlock:
2609 pte_unmap_unlock(page_table, ptl);
2610out:
2611 return ret;
2612out_nomap:
2613 mem_cgroup_cancel_charge_swapin(ptr);
2614 pte_unmap_unlock(page_table, ptl);
2615out_page:
2616 unlock_page(page);
2617out_release:
2618 page_cache_release(page);
2619 return ret;
2620}
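
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */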
2627static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2628 unsigned long address, pte_t *page_table, pmd_t *pmd,
2629 unsigned int flags)
2630{
2631 struct page *page;
2632 spinlock_t *ptl;
2633 pte_t entry;
2634
2635 if (!(flags & FAULT_FLAG_WRITE)) {
2636 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2637 vma->vm_page_prot));
2638 ptl = pte_lockptr(mm, pmd);
2639 spin_lock(ptl);
2640 if (!pte_none(*page_table))
2641 goto unlock;
2642 goto setpte;
2643 }
2644
2645
2646 pte_unmap(page_table);
2647
2648 if (unlikely(anon_vma_prepare(vma)))
2649 goto oom;
2650 page = alloc_zeroed_user_highpage_movable(vma, address);
2651 if (!page)
2652 goto oom;
2653 __SetPageUptodate(page);
2654
2655 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2656 goto oom_free_page;
2657
2658 entry = mk_pte(page, vma->vm_page_prot);
2659 if (vma->vm_flags & VM_WRITE)
2660 entry = pte_mkwrite(pte_mkdirty(entry));
2661
2662 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2663 if (!pte_none(*page_table))
2664 goto release;
2665
2666 inc_mm_counter(mm, anon_rss);
2667 page_add_new_anon_rmap(page, vma, address);
2668setpte:
2669 set_pte_at(mm, address, page_table, entry);
2670
2671
2672 update_mmu_cache(vma, address, entry);
2673unlock:
2674 pte_unmap_unlock(page_table, ptl);
2675 return 0;
2676release:
2677 mem_cgroup_uncharge_page(page);
2678 page_cache_release(page);
2679 goto unlock;
2680oom_free_page:
2681 page_cache_release(page);
2682oom:
2683 return VM_FAULT_OOM;
2684}
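
/*
 * __do_fault() tries to create a new page mapping. It goes through the
 * possible fault paths and calls the vma's ->fault function; for a
 * private writable mapping it also performs the COW copy of the page
 * returned by ->fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte neither mapped nor locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */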
2699static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2700 unsigned long address, pmd_t *pmd,
2701 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2702{
2703 pte_t *page_table;
2704 spinlock_t *ptl;
2705 struct page *page;
2706 pte_t entry;
2707 int anon = 0;
2708 int charged = 0;
2709 struct page *dirty_page = NULL;
2710 struct vm_fault vmf;
2711 int ret;
2712 int page_mkwrite = 0;
2713
2714 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2715 vmf.pgoff = pgoff;
2716 vmf.flags = flags;
2717 vmf.page = NULL;
2718
2719 ret = vma->vm_ops->fault(vma, &vmf);
2720 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2721 return ret;
2722
2723 if (unlikely(PageHWPoison(vmf.page))) {
2724 if (ret & VM_FAULT_LOCKED)
2725 unlock_page(vmf.page);
2726 return VM_FAULT_HWPOISON;
2727 }
2728
2729
2730
2731
2732
2733 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2734 lock_page(vmf.page);
2735 else
2736 VM_BUG_ON(!PageLocked(vmf.page));
2737
2738
2739
2740
2741 page = vmf.page;
2742 if (flags & FAULT_FLAG_WRITE) {
2743 if (!(vma->vm_flags & VM_SHARED)) {
2744 anon = 1;
2745 if (unlikely(anon_vma_prepare(vma))) {
2746 ret = VM_FAULT_OOM;
2747 goto out;
2748 }
2749 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
2750 vma, address);
2751 if (!page) {
2752 ret = VM_FAULT_OOM;
2753 goto out;
2754 }
2755 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
2756 ret = VM_FAULT_OOM;
2757 page_cache_release(page);
2758 goto out;
2759 }
2760 charged = 1;
2761
2762
2763
2764
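			/*
			 * Don't let another task, with possibly unlocked vma,
			 * keep the mlocked page.
			 */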
2765 if (vma->vm_flags & VM_LOCKED)
2766 clear_page_mlock(vmf.page);
2767 copy_user_highpage(page, vmf.page, address, vma);
2768 __SetPageUptodate(page);
2769 } else {
2770
2771
2772
2773
2774
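			/*
			 * If the page will be shareable, see if the backing
			 * address space wants to know that the page is about
			 * to become writable.
			 */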
2775 if (vma->vm_ops->page_mkwrite) {
2776 int tmp;
2777
2778 unlock_page(page);
2779 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2780 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2781 if (unlikely(tmp &
2782 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2783 ret = tmp;
2784 goto unwritable_page;
2785 }
2786 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2787 lock_page(page);
2788 if (!page->mapping) {
2789 ret = 0;
2790 unlock_page(page);
2791 goto unwritable_page;
2792 }
2793 } else
2794 VM_BUG_ON(!PageLocked(page));
2795 page_mkwrite = 1;
2796 }
2797 }
2798
2799 }
2800
2801 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
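	/*
	 * Only install the new pte if nobody else changed the entry while
	 * the page table lock was dropped (->fault and ->page_mkwrite may
	 * sleep).  If FAULT_FLAG_WRITE is set we either hold an exclusive
	 * copy of the page or this is a shared mapping, so the pte can be
	 * made writable and dirty right away.
	 */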
2814 if (likely(pte_same(*page_table, orig_pte))) {
2815 flush_icache_page(vma, page);
2816 entry = mk_pte(page, vma->vm_page_prot);
2817 if (flags & FAULT_FLAG_WRITE)
2818 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2819 if (anon) {
2820 inc_mm_counter(mm, anon_rss);
2821 page_add_new_anon_rmap(page, vma, address);
2822 } else {
2823 inc_mm_counter(mm, file_rss);
2824 page_add_file_rmap(page);
2825 if (flags & FAULT_FLAG_WRITE) {
2826 dirty_page = page;
2827 get_page(dirty_page);
2828 }
2829 }
2830 set_pte_at(mm, address, page_table, entry);
2831
2832
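		/* no need to invalidate: a not-present page won't be cached */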
2833 update_mmu_cache(vma, address, entry);
2834 } else {
2835 if (charged)
2836 mem_cgroup_uncharge_page(page);
2837 if (anon)
2838 page_cache_release(page);
2839 else
2840			anon = 1; /* no new anon page; release the faulted page below */
2841 }
2842
2843 pte_unmap_unlock(page_table, ptl);
2844
2845out:
2846 if (dirty_page) {
2847 struct address_space *mapping = page->mapping;
2848
2849 if (set_page_dirty(dirty_page))
2850 page_mkwrite = 1;
2851 unlock_page(dirty_page);
2852 put_page(dirty_page);
2853 if (page_mkwrite && mapping) {
2854
2855
2856
2857
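			/*
			 * Some device drivers do not set page.mapping but still
			 * dirty their pages.
			 */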
2858 balance_dirty_pages_ratelimited(mapping);
2859 }
2860
2861
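		/* file_update_time outside page_lock */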
2862 if (vma->vm_file)
2863 file_update_time(vma->vm_file);
2864 } else {
2865 unlock_page(vmf.page);
2866 if (anon)
2867 page_cache_release(vmf.page);
2868 }
2869
2870 return ret;
2871
2872unwritable_page:
2873 page_cache_release(page);
2874 return ret;
2875}
2876
2877static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2878 unsigned long address, pte_t *page_table, pmd_t *pmd,
2879 unsigned int flags, pte_t orig_pte)
2880{
2881 pgoff_t pgoff = (((address & PAGE_MASK)
2882 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2883
2884 pte_unmap(page_table);
2885 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2886}
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
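/*
 * Fault of a previously existing named mapping. Repopulate the pte
 * from the encoded file pte if possible. This enables swappable
 * non-linear vmas.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */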
2897static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2898 unsigned long address, pte_t *page_table, pmd_t *pmd,
2899 unsigned int flags, pte_t orig_pte)
2900{
2901 pgoff_t pgoff;
2902
2903 flags |= FAULT_FLAG_NONLINEAR;
2904
2905 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2906 return 0;
2907
2908 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2909
2910
2911
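		/*
		 * Page table corrupted: show pte and kill process.
		 */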
2912 print_bad_pte(vma, address, orig_pte, NULL);
2913 return VM_FAULT_OOM;
2914 }
2915
2916 pgoff = pte_to_pgoff(orig_pte);
2917 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2918}
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
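/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */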
2933static inline int handle_pte_fault(struct mm_struct *mm,
2934 struct vm_area_struct *vma, unsigned long address,
2935 pte_t *pte, pmd_t *pmd, unsigned int flags)
2936{
2937 pte_t entry;
2938 spinlock_t *ptl;
2939
2940 entry = *pte;
2941 if (!pte_present(entry)) {
2942 if (pte_none(entry)) {
2943 if (vma->vm_ops) {
2944 if (likely(vma->vm_ops->fault))
2945 return do_linear_fault(mm, vma, address,
2946 pte, pmd, flags, entry);
2947 }
2948 return do_anonymous_page(mm, vma, address,
2949 pte, pmd, flags);
2950 }
2951 if (pte_file(entry))
2952 return do_nonlinear_fault(mm, vma, address,
2953 pte, pmd, flags, entry);
2954 return do_swap_page(mm, vma, address,
2955 pte, pmd, flags, entry);
2956 }
2957
2958 ptl = pte_lockptr(mm, pmd);
2959 spin_lock(ptl);
2960 if (unlikely(!pte_same(*pte, entry)))
2961 goto unlock;
2962 if (flags & FAULT_FLAG_WRITE) {
2963 if (!pte_write(entry))
2964 return do_wp_page(mm, vma, address,
2965 pte, pmd, ptl, entry);
2966 entry = pte_mkdirty(entry);
2967 }
2968 entry = pte_mkyoung(entry);
2969 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2970 update_mmu_cache(vma, address, entry);
2971 } else {
2972
2973
2974
2975
2976
2977
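		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */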
2978 if (flags & FAULT_FLAG_WRITE)
2979 flush_tlb_page(vma, address);
2980 }
2981unlock:
2982 pte_unmap_unlock(pte, ptl);
2983 return 0;
2984}
2985
2986
2987
2988
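/*
 * By the time we get here, we already hold the mm semaphore.
 */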
2989int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2990 unsigned long address, unsigned int flags)
2991{
2992 pgd_t *pgd;
2993 pud_t *pud;
2994 pmd_t *pmd;
2995 pte_t *pte;
2996
2997 __set_current_state(TASK_RUNNING);
2998
2999 count_vm_event(PGFAULT);
3000
3001 if (unlikely(is_vm_hugetlb_page(vma)))
3002 return hugetlb_fault(mm, vma, address, flags);
3003
3004 pgd = pgd_offset(mm, address);
3005 pud = pud_alloc(mm, pgd, address);
3006 if (!pud)
3007 return VM_FAULT_OOM;
3008 pmd = pmd_alloc(mm, pud, address);
3009 if (!pmd)
3010 return VM_FAULT_OOM;
3011 pte = pte_alloc_map(mm, pmd, address);
3012 if (!pte)
3013 return VM_FAULT_OOM;
3014
3015 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3016}
3017
3018#ifndef __PAGETABLE_PUD_FOLDED
3019
3020
3021
3022
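/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */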
3023int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3024{
3025 pud_t *new = pud_alloc_one(mm, address);
3026 if (!new)
3027 return -ENOMEM;
3028
3029	smp_wmb(); /* ensure the new pud is initialised before it is linked into the pgd */
3030
3031 spin_lock(&mm->page_table_lock);
3032 if (pgd_present(*pgd))
3033 pud_free(mm, new);
3034 else
3035 pgd_populate(mm, pgd, new);
3036 spin_unlock(&mm->page_table_lock);
3037 return 0;
3038}
3039#endif /* __PAGETABLE_PUD_FOLDED */
3040
3041#ifndef __PAGETABLE_PMD_FOLDED
3042
3043
3044
3045
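/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */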
3046int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3047{
3048 pmd_t *new = pmd_alloc_one(mm, address);
3049 if (!new)
3050 return -ENOMEM;
3051
3052	smp_wmb(); /* ensure the new pmd is initialised before it is linked into the pud */
3053
3054 spin_lock(&mm->page_table_lock);
3055#ifndef __ARCH_HAS_4LEVEL_HACK
3056 if (pud_present(*pud))
3057 pmd_free(mm, new);
3058 else
3059 pud_populate(mm, pud, new);
3060#else
3061 if (pgd_present(*pud))
3062 pmd_free(mm, new);
3063 else
3064 pgd_populate(mm, pud, new);
3065#endif
3066 spin_unlock(&mm->page_table_lock);
3067 return 0;
3068}
3069#endif /* __PAGETABLE_PMD_FOLDED */
3070
3071int make_pages_present(unsigned long addr, unsigned long end)
3072{
3073 int ret, len, write;
3074 struct vm_area_struct * vma;
3075
3076 vma = find_vma(current->mm, addr);
3077 if (!vma)
3078 return -ENOMEM;
3079 write = (vma->vm_flags & VM_WRITE) != 0;
3080 BUG_ON(addr >= end);
3081 BUG_ON(end > vma->vm_end);
3082 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3083 ret = get_user_pages(current, current->mm, addr,
3084 len, write, 0, NULL, NULL);
3085 if (ret < 0)
3086 return ret;
3087 return ret == len ? 0 : -EFAULT;
3088}
3089
3090#if !defined(__HAVE_ARCH_GATE_AREA)
3091
3092#if defined(AT_SYSINFO_EHDR)
3093static struct vm_area_struct gate_vma;
3094
3095static int __init gate_vma_init(void)
3096{
3097 gate_vma.vm_mm = NULL;
3098 gate_vma.vm_start = FIXADDR_USER_START;
3099 gate_vma.vm_end = FIXADDR_USER_END;
3100 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3101 gate_vma.vm_page_prot = __P101;
3102
3103
3104
3105
3106
3107
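	/*
	 * Make sure the vDSO gets into every core dump.
	 * Dumping its contents makes post-mortem fully interpretable later
	 * without matching up the same kernel and hardware config to see
	 * what PC values meant.
	 */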
3108 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3109 return 0;
3110}
3111__initcall(gate_vma_init);
3112#endif
3113
3114struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3115{
3116#ifdef AT_SYSINFO_EHDR
3117 return &gate_vma;
3118#else
3119 return NULL;
3120#endif
3121}
3122
3123int in_gate_area_no_task(unsigned long addr)
3124{
3125#ifdef AT_SYSINFO_EHDR
3126 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3127 return 1;
3128#endif
3129 return 0;
3130}
3131
3132#endif	/* __HAVE_ARCH_GATE_AREA */
3133
3134static int follow_pte(struct mm_struct *mm, unsigned long address,
3135 pte_t **ptepp, spinlock_t **ptlp)
3136{
3137 pgd_t *pgd;
3138 pud_t *pud;
3139 pmd_t *pmd;
3140 pte_t *ptep;
3141
3142 pgd = pgd_offset(mm, address);
3143 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3144 goto out;
3145
3146 pud = pud_offset(pgd, address);
3147 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3148 goto out;
3149
3150 pmd = pmd_offset(pud, address);
3151 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3152 goto out;
3153
3154
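	/* We cannot handle huge page PFN maps here; bail out. */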
3155 if (pmd_huge(*pmd))
3156 goto out;
3157
3158 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3159 if (!ptep)
3160 goto out;
3161 if (!pte_present(*ptep))
3162 goto unlock;
3163 *ptepp = ptep;
3164 return 0;
3165unlock:
3166 pte_unmap_unlock(ptep, *ptlp);
3167out:
3168 return -EINVAL;
3169}
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
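/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */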
3181int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3182 unsigned long *pfn)
3183{
3184 int ret = -EINVAL;
3185 spinlock_t *ptl;
3186 pte_t *ptep;
3187
3188 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3189 return ret;
3190
3191 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3192 if (ret)
3193 return ret;
3194 *pfn = pte_pfn(*ptep);
3195 pte_unmap_unlock(ptep, ptl);
3196 return 0;
3197}
3198EXPORT_SYMBOL(follow_pfn);
3199
3200#ifdef CONFIG_HAVE_IOREMAP_PROT
3201int follow_phys(struct vm_area_struct *vma,
3202 unsigned long address, unsigned int flags,
3203 unsigned long *prot, resource_size_t *phys)
3204{
3205 int ret = -EINVAL;
3206 pte_t *ptep, pte;
3207 spinlock_t *ptl;
3208
3209 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3210 goto out;
3211
3212 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3213 goto out;
3214 pte = *ptep;
3215
3216 if ((flags & FOLL_WRITE) && !pte_write(pte))
3217 goto unlock;
3218
3219 *prot = pgprot_val(pte_pgprot(pte));
3220 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3221
3222 ret = 0;
3223unlock:
3224 pte_unmap_unlock(ptep, ptl);
3225out:
3226 return ret;
3227}
3228
3229int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3230 void *buf, int len, int write)
3231{
3232 resource_size_t phys_addr;
3233 unsigned long prot = 0;
3234 void __iomem *maddr;
3235 int offset = addr & (PAGE_SIZE-1);
3236
3237 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3238 return -EINVAL;
3239
3240 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3241 if (write)
3242 memcpy_toio(maddr + offset, buf, len);
3243 else
3244 memcpy_fromio(buf, maddr + offset, len);
3245 iounmap(maddr);
3246
3247 return len;
3248}
3249#endif	/* CONFIG_HAVE_IOREMAP_PROT */
3250
3251
3252
3253
3254
3255
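/*
 * Access another process' address space.
 * The source/target buffer must be in kernel space; the walk is done
 * via get_user_pages rather than by touching page tables directly.
 */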
3256int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
3257{
3258 struct mm_struct *mm;
3259 struct vm_area_struct *vma;
3260 void *old_buf = buf;
3261
3262 mm = get_task_mm(tsk);
3263 if (!mm)
3264 return 0;
3265
3266 down_read(&mm->mmap_sem);
3267
3268 while (len) {
3269 int bytes, ret, offset;
3270 void *maddr;
3271 struct page *page = NULL;
3272
3273 ret = get_user_pages(tsk, mm, addr, 1,
3274 write, 1, &page, &vma);
3275 if (ret <= 0) {
3276
3277
3278
3279
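			/*
			 * get_user_pages() failed: the address may be mapped
			 * by a VM_IO | VM_PFNMAP vma without struct pages, so
			 * try the vma's ->access() method instead.
			 */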
3280#ifdef CONFIG_HAVE_IOREMAP_PROT
3281 vma = find_vma(mm, addr);
3282 if (!vma)
3283 break;
3284 if (vma->vm_ops && vma->vm_ops->access)
3285 ret = vma->vm_ops->access(vma, addr, buf,
3286 len, write);
3287 if (ret <= 0)
3288#endif
3289 break;
3290 bytes = ret;
3291 } else {
3292 bytes = len;
3293 offset = addr & (PAGE_SIZE-1);
3294 if (bytes > PAGE_SIZE-offset)
3295 bytes = PAGE_SIZE-offset;
3296
3297 maddr = kmap(page);
3298 if (write) {
3299 copy_to_user_page(vma, page, addr,
3300 maddr + offset, buf, bytes);
3301 set_page_dirty_lock(page);
3302 } else {
3303 copy_from_user_page(vma, page, addr,
3304 buf, maddr + offset, bytes);
3305 }
3306 kunmap(page);
3307 page_cache_release(page);
3308 }
3309 len -= bytes;
3310 buf += bytes;
3311 addr += bytes;
3312 }
3313 up_read(&mm->mmap_sem);
3314 mmput(mm);
3315
3316 return buf - old_buf;
3317}
3318
3319
3320
3321
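/*
 * Print the name of a VMA.
 */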
3322void print_vma_addr(char *prefix, unsigned long ip)
3323{
3324 struct mm_struct *mm = current->mm;
3325 struct vm_area_struct *vma;
3326
3327
3328
3329
3330
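	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */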
3331 if (preempt_count())
3332 return;
3333
3334 down_read(&mm->mmap_sem);
3335 vma = find_vma(mm, ip);
3336 if (vma && vma->vm_file) {
3337 struct file *f = vma->vm_file;
3338 char *buf = (char *)__get_free_page(GFP_KERNEL);
3339 if (buf) {
3340 char *p, *s;
3341
3342 p = d_path(&f->f_path, buf, PAGE_SIZE);
3343 if (IS_ERR(p))
3344 p = "?";
3345 s = strrchr(p, '/');
3346 if (s)
3347 p = s+1;
3348 printk("%s%s[%lx+%lx]", prefix, p,
3349 vma->vm_start,
3350 vma->vm_end - vma->vm_start);
3351 free_page((unsigned long)buf);
3352 }
3353 }
3354	up_read(&current->mm->mmap_sem);
3355}
3356
3357#ifdef CONFIG_PROVE_LOCKING
3358void might_fault(void)
3359{
3360
3361
3362
3363
3364
3365
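	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate false positives.
	 */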
3366 if (segment_eq(get_fs(), KERNEL_DS))
3367 return;
3368
3369 might_sleep();
3370
3371
3372
3373
3374
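	/*
	 * It would be nicer to annotate only the paths which are not under
	 * pagefault_disable, but that requires a larger audit and providing
	 * helpers like get_user_atomic.
	 */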
3375 if (!in_atomic() && current->mm)
3376	might_lock_read(&current->mm->mmap_sem);
3377}
3378EXPORT_SYMBOL(might_fault);
3379#endif	/* CONFIG_PROVE_LOCKING */
3380