/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory is the virtual address of the first page beyond the
 * kernel's direct mapping of low memory; it is set up by architecture
 * code during boot.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr = 0;
	batch->max = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @start and @end are set to 0 and -1
 *	respectively when @mm is without users and we're going to destroy
 *	the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
	tlb->page_size = 0;

	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}
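
/*
 * Note on ordering: the TLB invalidation done by tlb_flush_mmu_tlbonly()
 * must complete before the pages queued in the batches are handed back to
 * the allocator by tlb_flush_mmu_free(), otherwise another CPU could still
 * reach a page through a stale TLB entry after it has been reused. The
 * usual teardown sequence driven by the core VM (cf. exit_mmap()) is:
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	unmap_vmas(&tlb, vma, start, end);
 *	free_pgtables(&tlb, vma, floor, ceiling);
 *	tlb_finish_mmu(&tlb, start, end);
 */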

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)),
 *	handling the additional races in SMP caused by other CPUs caching
 *	valid mappings in their TLBs. Returns true if the caller should flush
 *	(because the batches are full), false otherwise.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);
	VM_WARN_ON(tlb->page_size != page_size);

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return false;
}
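
/*
 * A true return from __tlb_remove_page_size() means both the local batch
 * and any overflow batches are full: the caller must call tlb_flush_mmu()
 * before queueing further pages (see the force_flush handling in
 * zap_pte_range() for the canonical caller-side pattern).
 */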

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period per-se, however there is no other
	 * way to ensure that there are no concurrent software page-table
	 * walkers: sending the IPI and waiting for it forces every CPU out
	 * of its interrupts-disabled (and therefore page-table walking)
	 * section before we free the table directly.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there are fewer than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
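
/*
 * Summary of the table-freeing protocol above: page-table pages are queued
 * on an mmu_table_batch and freed via call_rcu_sched(), so that lockless
 * walkers in the gup_fast() family - which run with interrupts disabled -
 * can never observe a freed table. When no batch can be allocated,
 * tlb_remove_table_one() falls back to an IPI broadcast, which equally
 * waits for every CPU to leave its interrupts-disabled walk before the
 * table is freed directly.
 */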

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE
	 * (see pte_free_tlb()), flush the tlb if we need to.
	 */
	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a struct
 * page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial. Secondly,
 * an architecture may not have a spare pte bit, which requires a more
 * complicated scheme, described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered
 * a special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every
 * special mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * and a page installed by a COW fault will break that identity.
 *
 * This restricts such mappings to be a linear translation from virtual
 * address to pfn. To get around this restriction, we allow arbitrary
 * mappings so long as the vma is not a COW mapping; in that case, we know
 * that all ptes are special (because none can have been COWed).
 *
 * In order to support COW of arbitrary special mappings, we have
 * VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and
 * considered normal pages by the VM. The disadvantage is that pages are
 * refcounted (which can be slower and simply not an option for some PFN
 * based drivers).
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
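
/*
 * Worked example of the linearity rule tested above: if a driver used
 * remap_pfn_range() to map pfn 0x400 at vma->vm_start, then vma->vm_pgoff
 * is 0x400, and for the second page of the mapping
 *
 *	vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) == 0x401
 *
 * matches the pte's pfn, so the pte is treated as special and NULL is
 * returned. A page installed later by a COW fault breaks this identity
 * and is therefore correctly treated as a normal page.
 */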

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special(), so just replicate the
	 * !HAVE_PTE_SPECIAL handling from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			rss[mm_counter(page)]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}
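
/*
 * The retry protocol above: copy_one_pte() returns a non-zero swap entry
 * when swap_duplicate() finds the swap count full and a continuation page
 * is needed. Both page-table locks are dropped first, because
 * add_swap_count_continuation() allocates with GFP_KERNEL and may sleep;
 * the "again" loop then resumes copying at the address where it stopped.
 */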

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
				    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}

void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * Usually vma->vm_file is non-NULL for hugetlb
			 * vmas; however mmap_region() may nullify it on an
			 * error path before the vma is torn down again.
			 * Since no pte has actually been set up in that
			 * case, it is safe to do nothing here.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, NULL);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd have
 * to ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
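
/*
 * Sketch of a typical call site in a driver's ->mmap() handler (purely
 * illustrative; "my_page" stands for some hypothetical page-backed buffer
 * owned by the driver, it is not defined in this file):
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return vm_insert_page(vma, vma->vm_start, my_page);
 *	}
 *
 * Because ->mmap() runs with mmap_sem held for write, vm_insert_page()
 * is allowed to set VM_MIXEDMAP on the vma itself, as done above.
 */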

static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pfn);

/**
 * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vm_insert_pfn, except that it allows drivers to
 * override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * cow mappings.  In general, using multiple vmas is preferable;
 * vm_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 */
int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	int ret;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);

	return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);

int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn)
{
	pgprot_t pgprot = vma->vm_page_prot;

	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	track_pfn_insert(vma, &pgprot, pfn);

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn), so that the "struct page" lifetime is handled
	 * correctly.
	 */
	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		return insert_page(vma, addr, page, pgprot);
	}
	return insert_pfn(vma, addr, pfn, pgprot);
}
EXPORT_SYMBOL(vm_insert_mixed);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *      Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
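
/*
 * Sketch of a typical call site in a driver's ->mmap() handler (purely
 * illustrative; the MMIO base address "MYDEV_MMIO_BASE" is hypothetical):
 *
 *	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long pfn = MYDEV_MMIO_BASE >> PAGE_SHIFT;
 *
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */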

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);
	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
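
/*
 * Sketch of an apply_to_page_range() callback (purely illustrative; the
 * callback name is hypothetical). A pte_fn_t receives the pte, the pmd's
 * pgtable token, the virtual address and an opaque cookie:
 *
 *	static int count_present(pte_t *pte, pgtable_t token,
 *				 unsigned long addr, void *data)
 *	{
 *		if (pte_present(*pte))
 *			(*(unsigned long *)data)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr = 0;
 *	apply_to_page_range(&init_mm, addr, size, count_present, &nr);
 */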

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * This really shouldn't fail, because the page is there
		 * in the page tables. But it might just be unreadable,
		 * in which case we just give up and fill the result with
		 * zeroes.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}

static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static int do_page_mkwrite(struct vm_fault *vmf)
{
	int ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			unlock_page(page);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static void fault_dirty_shared_page(struct vm_area_struct *vma,
				    struct page *page)
{
	struct address_space *mapping;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);
	/*
	 * Take a local copy of the address_space - page.mapping may be zeroed
	 * by truncate after unlock_page().  The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
	 * release semantics to prevent the compiler from reordering the
	 * instruction before unlock_page.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if ((dirtied || page_mkwrite) && mapping) {
		/*
		 * Some device drivers do not set page.mapping
		 * but still dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;
	/*
	 * Clear the pages cpupid information as the existing
	 * information potentially belongs to a now completely
	 * unrelated process.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static int wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	const unsigned long mmun_start = vmf->address & PAGE_MASK;
	const unsigned long mmun_end = mmun_start + PAGE_SIZE;
	struct mem_cgroup *memcg;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, vmf->address, vma);
	}

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
		goto oom_free_new;

	__SetPageUptodate(new_page);

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		mem_cgroup_commit_charge(new_page, memcg, false, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the old page.. */
		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg, false);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
		 * keep the mlocked page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it. The function returns 0 on
 * success, VM_FAULT_NOPAGE when the PTE got changed before we acquired the
 * PTE lock.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 */
int finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static int wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		int ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}

static int wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		int tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	fault_dirty_shared_page(vma, vmf->page);
	put_page(vmf->page);

	return VM_FAULT_WRITE;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static int do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
		int total_mapcount;
		if (!trylock_page(vmf->page)) {
			get_page(vmf->page);
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			lock_page(vmf->page);
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (!pte_same(*vmf->pte, vmf->orig_pte)) {
				unlock_page(vmf->page);
				pte_unmap_unlock(vmf->pte, vmf->ptl);
				put_page(vmf->page);
				return 0;
			}
			put_page(vmf->page);
		}
		if (reuse_swap_page(vmf->page, &total_mapcount)) {
			if (total_mapcount == 1) {
				/*
				 * The page is all ours. Move it to
				 * our anon_vma so the rmap code will
				 * not search our parent or siblings.
				 * Protected against the rmap code by
				 * the page lock.
				 */
				page_move_anon_rmap(vmf->page, vma);
			}
			unlock_page(vmf->page);
			wp_page_reuse(vmf);
			return VM_FAULT_WRITE;
		}
		unlock_page(vmf->page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = details->first_index;
		if (zba < vba)
			zba = vba;
		zea = details->last_index;
		if (zea > vea)
			zea = vea;

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified page range in the underlying
 * file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	struct zap_details details = { };
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	details.check_mapping = even_cows ? NULL : mapping;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
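
/*
 * Canonical callers: truncation (see truncate_pagecache()) passes
 * even_cows == 1 so that private COWed copies of the truncated range are
 * unmapped as well; pagecache invalidation passes even_cows == 0 so that
 * private copies survive.
 */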

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
int do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page, *swapcache;
	struct mem_cgroup *memcg;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	int ret = 0;

	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
					vmf->address);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * own processes.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		swapcache = page;
		goto out_release;
	}

	swapcache = page;
	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, vmf->address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * The page isn't present yet, go ahead and add it to the rmap
	 * and the mm counters: the swap entry is turning back into an
	 * anonymous page.  Both the page lock and the pte lock are held
	 * here, so the page cannot change under us while we set it up.
	 */
	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		vmf->flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	vmf->orig_pte = pte;
	if (page == swapcache) {
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
		mem_cgroup_commit_charge(page, memcg, true, false);
		activate_page(page);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}

	swap_free(entry);
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same).  For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	}

	if (vmf->flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(vmf);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge(page, memcg, false);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache) {
		unlock_page(swapcache);
		put_page(swapcache);
	}
	return ret;
}

/*
 * This is like a special single-page "expand_{down|up}wards()",
 * except we must first make sure that 'address{-|+}PAGE_SIZE'
 * doesn't hit another vma.
 */
static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
{
	address &= PAGE_MASK;
	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
		struct vm_area_struct *prev = vma->vm_prev;

		/*
		 * Is there a mapping abutting this one below?
		 *
		 * That's only ok if it's the same stack mapping
		 * that has gotten split..
		 */
		if (prev && prev->vm_end == address)
			return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;

		return expand_downwards(vma, address - PAGE_SIZE);
	}
	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
		struct vm_area_struct *next = vma->vm_next;

		/* As VM_GROWSDOWN but s/below/above/ */
		if (next && next->vm_start == address + PAGE_SIZE)
			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;

		return expand_upwards(vma, address + PAGE_SIZE);
	}
	return 0;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).  We return with mmap_sem still held,
 * but pte unmapped and unlocked.
 */
static int do_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mem_cgroup *memcg;
	struct page *page;
	pte_t entry;

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, vmf->address) < 0)
		return VM_FAULT_SIGSEGV;

	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have down_read(mmap_sem).
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
		return VM_FAULT_OOM;

	/* See the comment in pte_alloc_one_map() */
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
		return 0;

	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
						vma->vm_page_prot));
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (!pte_none(*vmf->pte))
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
	if (!page)
		goto oom;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
		goto oom_free_page;

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (!pte_none(*vmf->pte))
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		mem_cgroup_cancel_charge(page, memcg, false);
		put_page(page);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, vmf->address, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
setpte:
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
release:
	mem_cgroup_cancel_charge(page, memcg, false);
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}

/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 */
static int __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	if (unlikely(PageHWPoison(vmf->page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf->page);
		put_page(vmf->page);
		vmf->page = NULL;
		return VM_FAULT_HWPOISON;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf->page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

	return ret;
}

static int pte_alloc_one_map(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (!pmd_none(*vmf->pmd))
		goto map_pte;
	if (vmf->prealloc_pte) {
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_none(*vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto map_pte;
		}

		atomic_long_inc(&vma->vm_mm->nr_ptes);
		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
		spin_unlock(vmf->ptl);
		vmf->prealloc_pte = NULL;
	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
		return VM_FAULT_OOM;
	}
map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
	 * in a different thread of this mm, in turn leading to a misleading
	 * pmd_trans_huge() retval.  All we have to ensure is that it is a
	 * regular pmd that we can walk with pte_offset_map() and we can do
	 * that through an atomic read in C, which is what
	 * pmd_trans_unstable() provides.
	 */
	if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
		return VM_FAULT_NOPAGE;

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE

#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
		unsigned long haddr)
{
	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
		return false;
	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return false;
	return true;
}
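
/*
 * Worked example (illustrative, assuming 4kB pages and HPAGE_PMD_NR == 512):
 * a file mapped at vm_start == 0x600000 with vm_pgoff == 0 passes the check
 * above for haddr == 0x600000, since the virtual start and the file offset
 * are congruent modulo 512 pages.  With vm_pgoff == 1 the file contents are
 * shifted by 4kB relative to the 2MB-aligned virtual addresses, so no
 * huge-page-aligned address can be mapped with a PMD.
 */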

static void deposit_prealloc_pte(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
	atomic_long_inc(&vma->vm_mm->nr_ptes);
	vmf->prealloc_pte = NULL;
}

static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t entry;
	int i, ret;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;

	ret = VM_FAULT_FALLBACK;
	page = compound_head(page);

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry.  Use the preallocated table for that.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
		goto out;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_icache_page(vma, page + i);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
	page_add_file_rmap(page, true);
	/*
	 * deposit and withdraw with pmd lock held
	 */
	if (arch_needs_pgtable_deposit())
		deposit_prealloc_pte(vmf);

	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

	update_mmu_cache_pmd(vma, haddr, vmf->pmd);

	/* fault is handled */
	ret = 0;
	count_vm_event(THP_FILE_MAPPED);
out:
	spin_unlock(vmf->ptl);
	return ret;
}
#else
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	BUILD_BUG();
	return 0;
}
#endif

/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping.  If needed, the function allocates a page table or uses a
 * pre-allocated one.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
 */
int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
		struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pte_t entry;
	int ret;

	if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
			IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
		/* THP on COW? */
		VM_BUG_ON_PAGE(memcg, page);

		ret = do_set_pmd(vmf, page);
		if (ret != VM_FAULT_FALLBACK)
			return ret;
	}

	if (!vmf->pte) {
		ret = pte_alloc_one_map(vmf);
		if (ret)
			return ret;
	}

	/* Re-check under ptl */
	if (unlikely(!pte_none(*vmf->pte)))
		return VM_FAULT_NOPAGE;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	} else {
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
		page_add_file_rmap(page, false);
	}
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache(vma, vmf->address, vmf->pte);

	return 0;
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared.  It handles locking of PTEs, performs the
 * necessary invalidations and sets the PTE.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Returns %0 on success, %VM_FAULT_ code in case of error.
 */
int finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	int ret;

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;
	ret = alloc_set_pte(vmf, vmf->memcg, page);
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}

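/*
 * Default fault-around window: 64kB, i.e. 16 pages with 4kB pages.
 * Tunable at runtime via debugfs (see fault_around_bytes_set() below).
 */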
static unsigned long fault_around_bytes __read_mostly =
	rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_bytes;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;
	if (val > PAGE_SIZE)
		fault_around_bytes = rounddown_pow_of_two(val);
	else
		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

static int __init fault_around_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
			&fault_around_bytes_fops);
	if (!ret)
		pr_warn("Failed to create fault_around_bytes in debugfs\n");
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
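
/*
 * Runtime tuning example (assuming debugfs is mounted at the conventional
 * /sys/kernel/debug):
 *
 *	cat /sys/kernel/debug/fault_around_bytes
 *	echo 16384 > /sys/kernel/debug/fault_around_bytes
 *
 * Values above PTRS_PER_PTE pages are rejected with -EINVAL; other values
 * are rounded down to a power of two, with PAGE_SIZE as the minimum.
 */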

/*
 * do_fault_around() tries to map few pages around the fault address.  The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken.  In the split
 * ptlock case the page table lock only protects those entries which belong
 * to the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call
 * map_pages() only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or
 * equal to PTRS_PER_PTE * PAGE_SIZE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes (and therefore to page order).  This way it's easier
 * to guarantee that we don't cross page table boundaries.
 */
static int do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off, ret = 0;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
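	/*
	 * Worked example (illustrative, assuming 4kB pages and the default
	 * 64kB window): nr_pages = 65536 >> 12 = 16, so
	 * mask = ~(16 * 4096 - 1) & PAGE_MASK == ~0xffffUL, and the start
	 * address below is rounded down to a 64kB boundary.
	 */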

	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 *  end_pgoff is either the end of the page table, the end of
	 *  the vma, or start_pgoff + nr_pages - 1, whichever is nearest.
	 */
	end_pgoff = start_pgoff -
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
						  vmf->address);
		if (!vmf->prealloc_pte)
			goto out;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

	/* Huge page is mapped? Page fault is solved */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() haven't done anything useful? Don't bother */
	if (!vmf->pte)
		goto out;

	/* check if the page fault is solved */
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
		ret = VM_FAULT_NOPAGE;
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	vmf->address = address;
	vmf->pte = NULL;
	return ret;
}

static int do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}

static int do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
				&vmf->memcg, false)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
	put_page(vmf->cow_page);
	return ret;
}

static int do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	fault_dirty_shared_page(vma, vmf->page);
	return ret;
}

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
	if (!vma->vm_ops->fault)
		ret = VM_FAULT_SIGBUS;
	else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vma->vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}

static int do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL;
	int page_nid = -1;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	pte_t pte;
	bool was_writable = pte_savedwrite(vmf->orig_pte);
	int flags = 0;

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same().  It's of NUMA type but
	 * the pfn may be bogus if the read is not atomic.
	 */
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

	/*
	 * Make it present again.  Depending on how the arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
	pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
	pte = pte_modify(pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state.  This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces.  This
	 * is later used when determining whether to group tasks together.
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	if (target_nid == -1) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}

static int create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}

static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_wp_page(vmf, orig_pmd);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

	/* COW handled on pte level: split pmd */
	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);

	return VM_FAULT_FALLBACK;
}

static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}

static int create_huge_pud(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_sem read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		vmf->orig_pte = *vmf->pte;

		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
		 * atomic accesses.  The code below just needs a consistent
		 * view for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
		barrier();
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry)))
		goto unlock;
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	int ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
			unsigned int dirty = flags & FAULT_FLAG_WRITE;

			/* NUMA case for anonymous PUDs would go here */

			if (dirty && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *vmf.pmd;

		barrier();
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
				return do_huge_pmd_numa_page(&vmf, orig_pmd);

			if ((vmf.flags & FAULT_FLAG_WRITE) &&
					!pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(&vmf, orig_pmd);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(&vmf, orig_pmd);
				return 0;
			}
		}
	}

	return handle_pte_fault(&vmf);
}

/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * mode.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	/*
	 * This mm has been already reaped by the oom reaper and so the
	 * refault cannot be trusted in general.  Anonymous refaults would
	 * lose data and give a zero page instead, e.g.  This is especially
	 * a problem for use_mm() because regular tasks will just die and
	 * the corrupted data will not be visible anywhere while kthread
	 * will outlive the oom victim and potentially propagate the data
	 * further.
	 */
	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
		ret = VM_FAULT_SIGBUS;

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
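
/*
 * Illustrative call site (a sketch of what arch fault handlers such as
 * arch/x86/mm/fault.c do, not code from this file): after resolving the
 * vma under mmap_sem, the arch code hands the fault to the generic layer:
 *
 *	fault = handle_mm_fault(vma, address, flags);
 *	if (fault & VM_FAULT_ERROR)
 *		return mm_fault_error(regs, fault);	(hypothetical helper)
 */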

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_5LEVEL_HACK
	if (p4d_present(*p4d))		/* Another has populated it */
		pud_free(mm, new);
	else
		p4d_populate(mm, p4d, new);
#else
	if (pgd_present(*p4d))		/* Another has populated it */
		pud_free(mm, new);
	else
		pgd_populate(mm, p4d, new);
#endif /* __ARCH_HAS_5LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(ptl);
	return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
		pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));

	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

		*ptlp = pmd_lock(mm, pmd);
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!ptep)
		goto out;
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
out:
	return -EINVAL;
}

static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
					   ptlp)));
	return res;
}

int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
					   ptlp)));
	return res;
}
EXPORT_SYMBOL(follow_pte_pmd);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	int ret = -EINVAL;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return ret;

	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (ret)
		return ret;
	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(follow_pfn);
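
/*
 * Illustrative use (a sketch, not code from this file): a driver that knows
 * it is dealing with a VM_PFNMAP mapping can translate a user address to a
 * page frame number:
 *
 *	unsigned long pfn;
 *
 *	if (!follow_pfn(vma, addr, &pfn))
 *		pr_debug("addr %lx maps to pfn %lx\n", addr, pfn);
 */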

#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}

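/*
 * generic_access_phys() is meant to be plugged into ->access() in a
 * vm_operations_struct by drivers that map raw physical memory (the
 * /dev/mem mapping is the classic user), so that ptrace() can peek and
 * poke VM_IO areas.  Hypothetical sketch:
 *
 *	static const struct vm_operations_struct foo_mem_ops = {
 *		.access = generic_access_phys,
 *	};
 */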
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	unsigned long prot = 0;
	void __iomem *maddr;
	int offset = addr & (PAGE_SIZE-1);

	if (follow_phys(vma, addr, write, &prot, &phys_addr))
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	iounmap(maddr);

	return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;
	int write = gup_flags & FOLL_WRITE;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(tsk, mm, addr, 1,
				gup_flags, &page, &vma, NULL);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
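
/*
 * Illustrative use (a sketch of what the ptrace core does, not code from
 * this file): peeking one word of a tracee, forcing access the way
 * ptrace does:
 *
 *	unsigned long word;
 *
 *	if (access_process_vm(child, addr, &word, sizeof(word),
 *			      FOLL_FORCE) != sizeof(word))
 *		return -EIO;
 */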

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */
	if (preempt_count())
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&mm->mmap_sem);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, so we don't have to worry about it.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static void clear_gigantic_page(struct page *page,
				unsigned long addr,
				unsigned int pages_per_huge_page)
{
	int i;
	struct page *p = page;

	might_sleep();
	for (i = 0; i < pages_per_huge_page;
	     i++, p = mem_map_next(p, page, i)) {
		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}
void clear_huge_page(struct page *page,
		     unsigned long addr, unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_user_gigantic_page(struct page *dst, struct page *src,
				    unsigned long addr,
				    struct vm_area_struct *vma,
				    unsigned int pages_per_huge_page)
{
	int i;
	struct page *dst_base = dst;
	struct page *src_base = src;

	for (i = 0; i < pages_per_huge_page; ) {
		cond_resched();
		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);

		i++;
		dst = mem_map_next(dst, dst_base, i);
		src = mem_map_next(src, src_base, i);
	}
}

void copy_user_huge_page(struct page *dst, struct page *src,
			 unsigned long addr, struct vm_area_struct *vma,
			 unsigned int pages_per_huge_page)
{
	int i;

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		copy_user_gigantic_page(dst, src, addr, vma,
					pages_per_huge_page);
		return;
	}

	might_sleep();
	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

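/*
 * copy_huge_page_from_user() mirrors copy_from_user(): it returns the
 * number of bytes that could NOT be copied, so 0 means complete success.
 * Worked example (illustrative, 4kB pages, 512-subpage huge page): if the
 * copy faults on the third subpage with rc = 100, two full subpages plus
 * 3996 bytes were copied, and 2097152 - 12188 = 2084964 bytes are reported
 * back as uncopied.
 */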
long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *src = (void *)usr_src;
	void *page_kaddr;
	unsigned long i, rc = 0;
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;

	for (i = 0; i < pages_per_huge_page; i++) {
		if (allow_pagefault)
			page_kaddr = kmap(dst_page + i);
		else
			page_kaddr = kmap_atomic(dst_page + i);
		rc = copy_from_user(page_kaddr,
				(const void __user *)(src + i * PAGE_SIZE),
				PAGE_SIZE);
		if (allow_pagefault)
			kunmap(dst_page + i);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;

		cond_resched();
	}
	return ret_val;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

void __init ptlock_cache_init(void)
{
	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
			SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct page *page)
{
	spinlock_t *ptl;

	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
	if (!ptl)
		return false;
	page->ptl = ptl;
	return true;
}

void ptlock_free(struct page *page)
{
	kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif