/*
 *  linux/mm/memory.c
 *
 *  Core memory management: page-table setup and teardown, copying and
 *  zapping of page ranges for fork()/munmap(), the page-fault paths
 *  (demand paging, copy-on-write, swap-in) and the vm_insert_*() /
 *  remap_pfn_range() helpers used by drivers.
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES

unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86, including ioremap(), rely on the
 * assumption that high_memory defines the upper bound on direct map
 * memory, then end of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

143#if defined(SPLIT_RSS_COUNTING)
144
145void sync_mm_rss(struct mm_struct *mm)
146{
147 int i;
148
149 for (i = 0; i < NR_MM_COUNTERS; i++) {
150 if (current->rss_stat.count[i]) {
151 add_mm_counter(mm, i, current->rss_stat.count[i]);
152 current->rss_stat.count[i] = 0;
153 }
154 }
155 current->rss_stat.events = 0;
156}
157
158static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
159{
160 struct task_struct *task = current;
161
162 if (likely(task->mm == mm))
163 task->rss_stat.count[member] += val;
164 else
165 add_mm_counter(mm, member, val);
166}
167#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
168#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
171#define TASK_RSS_EVENTS_THRESH (64)
172static void check_sync_rss_stat(struct task_struct *task)
173{
174 if (unlikely(task != current))
175 return;
176 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
177 sync_mm_rss(task->mm);
178}
179#else
180
181#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
182#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
183
184static void check_sync_rss_stat(struct task_struct *task)
185{
186}
187
188#endif
189
190#ifdef HAVE_GENERIC_MMU_GATHER
191
192static bool tlb_next_batch(struct mmu_gather *tlb)
193{
194 struct mmu_gather_batch *batch;
195
196 batch = tlb->active;
197 if (batch->next) {
198 tlb->active = batch->next;
199 return true;
200 }
201
202 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
203 return false;
204
205 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
206 if (!batch)
207 return false;
208
209 tlb->batch_count++;
210 batch->next = NULL;
211 batch->nr = 0;
212 batch->max = MAX_GATHER_BATCH;
213
214 tlb->active->next = batch;
215 tlb->active = batch;
216
217 return true;
218}
219
220void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
221 unsigned long start, unsigned long end)
222{
223 tlb->mm = mm;
224
225
226 tlb->fullmm = !(start | (end+1));
227 tlb->need_flush_all = 0;
228 tlb->local.next = NULL;
229 tlb->local.nr = 0;
230 tlb->local.max = ARRAY_SIZE(tlb->__pages);
231 tlb->active = &tlb->local;
232 tlb->batch_count = 0;
233
234#ifdef CONFIG_HAVE_RCU_TABLE_FREE
235 tlb->batch = NULL;
236#endif
237 tlb->page_size = 0;
238
239 __tlb_reset_range(tlb);
240}
241
242static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
243{
244 if (!tlb->end)
245 return;
246
247 tlb_flush(tlb);
248 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
249#ifdef CONFIG_HAVE_RCU_TABLE_FREE
250 tlb_table_flush(tlb);
251#endif
252 __tlb_reset_range(tlb);
253}
254
255static void tlb_flush_mmu_free(struct mmu_gather *tlb)
256{
257 struct mmu_gather_batch *batch;
258
259 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266void tlb_flush_mmu(struct mmu_gather *tlb)
267{
268 tlb_flush_mmu_tlbonly(tlb);
269 tlb_flush_mmu_free(tlb);
270}
271
272
273
274
275
276void arch_tlb_finish_mmu(struct mmu_gather *tlb,
277 unsigned long start, unsigned long end, bool force)
278{
279 struct mmu_gather_batch *batch, *next;
280
281 if (force)
282 __tlb_adjust_range(tlb, start, end - start);
283
284 tlb_flush_mmu(tlb);
285
286
287 check_pgt_cache();
288
289 for (batch = tlb->local.next; batch; batch = next) {
290 next = batch->next;
291 free_pages((unsigned long)batch, 0);
292 }
293 tlb->local.next = NULL;
294}
295
296
297
298
299
300
301
302
303bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
304{
305 struct mmu_gather_batch *batch;
306
307 VM_BUG_ON(!tlb->end);
308 VM_WARN_ON(tlb->page_size != page_size);
309
310 batch = tlb->active;
311
312
313
314
315 batch->pages[batch->nr++] = page;
316 if (batch->nr == batch->max) {
317 if (!tlb_next_batch(tlb))
318 return true;
319 batch = tlb->active;
320 }
321 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
322
323 return false;
324}
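
/*
 * Sketch of how callers normally reach the batching above: they use the
 * tlb_remove_page() / tlb_remove_page_size() wrappers from
 * <asm-generic/tlb.h> rather than calling __tlb_remove_page_size()
 * directly; roughly (see the header for the exact definition):
 *
 *	if (__tlb_remove_page_size(tlb, page, page_size))
 *		tlb_flush_mmu(tlb);
 */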
325
326#endif
327
328#ifdef CONFIG_HAVE_RCU_TABLE_FREE
329
330
331
332
333
334static void tlb_remove_table_smp_sync(void *arg)
335{
336
337}
338
339static void tlb_remove_table_one(void *table)
340{
341
342
343
344
345
346
347
348 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
349 __tlb_remove_table(table);
350}
351
352static void tlb_remove_table_rcu(struct rcu_head *head)
353{
354 struct mmu_table_batch *batch;
355 int i;
356
357 batch = container_of(head, struct mmu_table_batch, rcu);
358
359 for (i = 0; i < batch->nr; i++)
360 __tlb_remove_table(batch->tables[i]);
361
362 free_page((unsigned long)batch);
363}
364
365void tlb_table_flush(struct mmu_gather *tlb)
366{
367 struct mmu_table_batch **batch = &tlb->batch;
368
369 if (*batch) {
370 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
371 *batch = NULL;
372 }
373}
374
375void tlb_remove_table(struct mmu_gather *tlb, void *table)
376{
377 struct mmu_table_batch **batch = &tlb->batch;
378
379
380
381
382
383 if (atomic_read(&tlb->mm->mm_users) < 2) {
384 __tlb_remove_table(table);
385 return;
386 }
387
388 if (*batch == NULL) {
389 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
390 if (*batch == NULL) {
391 tlb_remove_table_one(table);
392 return;
393 }
394 (*batch)->nr = 0;
395 }
396 (*batch)->tables[(*batch)->nr++] = table;
397 if ((*batch)->nr == MAX_TABLE_BATCH)
398 tlb_table_flush(tlb);
399}
400
401#endif
402

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for
 *	page-table tear-down from @mm covering [@start, @end).  When the
 *	whole address space is going away (exit/execve), callers pass
 *	0 and -1 so that tlb->fullmm is set.
 */
408void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
409 unsigned long start, unsigned long end)
410{
411 arch_tlb_gather_mmu(tlb, mm, start, end);
412 inc_tlb_flush_pending(tlb->mm);
413}
414
415void tlb_finish_mmu(struct mmu_gather *tlb,
416 unsigned long start, unsigned long end)
417{
418
419
420
421
422
423
424
425 bool force = mm_tlb_flush_nested(tlb->mm);
426
427 arch_tlb_finish_mmu(tlb, start, end, force);
428 dec_tlb_flush_pending(tlb->mm);
429}
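
/*
 * Sketch (not from this file) of how the mmu_gather API above is used by
 * callers tearing down a range; unmap_region() in mm/mmap.c and
 * exit_mmap() follow roughly this shape:
 *
 *	struct mmu_gather tlb;
 *
 *	lru_add_drain();
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	update_hiwater_rss(mm);
 *	unmap_vmas(&tlb, vma, start, end);
 *	free_pgtables(&tlb, vma, floor, ceiling);
 *	tlb_finish_mmu(&tlb, start, end);
 */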
430

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
435static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
436 unsigned long addr)
437{
438 pgtable_t token = pmd_pgtable(*pmd);
439 pmd_clear(pmd);
440 pte_free_tlb(tlb, token, addr);
441 mm_dec_nr_ptes(tlb->mm);
442}
443
444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
445 unsigned long addr, unsigned long end,
446 unsigned long floor, unsigned long ceiling)
447{
448 pmd_t *pmd;
449 unsigned long next;
450 unsigned long start;
451
452 start = addr;
453 pmd = pmd_offset(pud, addr);
454 do {
455 next = pmd_addr_end(addr, end);
456 if (pmd_none_or_clear_bad(pmd))
457 continue;
458 free_pte_range(tlb, pmd, addr);
459 } while (pmd++, addr = next, addr != end);
460
461 start &= PUD_MASK;
462 if (start < floor)
463 return;
464 if (ceiling) {
465 ceiling &= PUD_MASK;
466 if (!ceiling)
467 return;
468 }
469 if (end - 1 > ceiling - 1)
470 return;
471
472 pmd = pmd_offset(pud, start);
473 pud_clear(pud);
474 pmd_free_tlb(tlb, pmd, start);
475 mm_dec_nr_pmds(tlb->mm);
476}
477
478static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
479 unsigned long addr, unsigned long end,
480 unsigned long floor, unsigned long ceiling)
481{
482 pud_t *pud;
483 unsigned long next;
484 unsigned long start;
485
486 start = addr;
487 pud = pud_offset(p4d, addr);
488 do {
489 next = pud_addr_end(addr, end);
490 if (pud_none_or_clear_bad(pud))
491 continue;
492 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
493 } while (pud++, addr = next, addr != end);
494
495 start &= P4D_MASK;
496 if (start < floor)
497 return;
498 if (ceiling) {
499 ceiling &= P4D_MASK;
500 if (!ceiling)
501 return;
502 }
503 if (end - 1 > ceiling - 1)
504 return;
505
506 pud = pud_offset(p4d, start);
507 p4d_clear(p4d);
508 pud_free_tlb(tlb, pud, start);
509 mm_dec_nr_puds(tlb->mm);
510}
511
512static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
513 unsigned long addr, unsigned long end,
514 unsigned long floor, unsigned long ceiling)
515{
516 p4d_t *p4d;
517 unsigned long next;
518 unsigned long start;
519
520 start = addr;
521 p4d = p4d_offset(pgd, addr);
522 do {
523 next = p4d_addr_end(addr, end);
524 if (p4d_none_or_clear_bad(p4d))
525 continue;
526 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
527 } while (p4d++, addr = next, addr != end);
528
529 start &= PGDIR_MASK;
530 if (start < floor)
531 return;
532 if (ceiling) {
533 ceiling &= PGDIR_MASK;
534 if (!ceiling)
535 return;
536 }
537 if (end - 1 > ceiling - 1)
538 return;
539
540 p4d = p4d_offset(pgd, start);
541 pgd_clear(pgd);
542 p4d_free_tlb(tlb, p4d, start);
543}
544

/*
 * This function frees user-level page tables of a process.
 */
548void free_pgd_range(struct mmu_gather *tlb,
549 unsigned long addr, unsigned long end,
550 unsigned long floor, unsigned long ceiling)
551{
552 pgd_t *pgd;
553 unsigned long next;
554
	/*
	 * Carefully trim [addr, end) against floor and ceiling so that we
	 * only free page tables lying entirely within the hole being
	 * unmapped: a pmd-level table shared with a still-mapped
	 * neighbouring region must be left in place.  That is why addr is
	 * rounded down to a pmd boundary and bumped by PMD_SIZE if that
	 * dips below floor, and end is pulled back by PMD_SIZE if it
	 * reaches past ceiling.
	 */
581 addr &= PMD_MASK;
582 if (addr < floor) {
583 addr += PMD_SIZE;
584 if (!addr)
585 return;
586 }
587 if (ceiling) {
588 ceiling &= PMD_MASK;
589 if (!ceiling)
590 return;
591 }
592 if (end - 1 > ceiling - 1)
593 end -= PMD_SIZE;
594 if (addr > end - 1)
595 return;
596
597
598
599
600 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
601 pgd = pgd_offset(tlb->mm, addr);
602 do {
603 next = pgd_addr_end(addr, end);
604 if (pgd_none_or_clear_bad(pgd))
605 continue;
606 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
607 } while (pgd++, addr = next, addr != end);
608}
609
610void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
611 unsigned long floor, unsigned long ceiling)
612{
613 while (vma) {
614 struct vm_area_struct *next = vma->vm_next;
615 unsigned long addr = vma->vm_start;
616
617
618
619
620
621 unlink_anon_vmas(vma);
622 unlink_file_vma(vma);
623
624 if (is_vm_hugetlb_page(vma)) {
625 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
626 floor, next ? next->vm_start : ceiling);
627 } else {
628
629
630
631 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
632 && !is_vm_hugetlb_page(next)) {
633 vma = next;
634 next = vma->vm_next;
635 unlink_anon_vmas(vma);
636 unlink_file_vma(vma);
637 }
638 free_pgd_range(tlb, addr, vma->vm_end,
639 floor, next ? next->vm_start : ceiling);
640 }
641 vma = next;
642 }
643}
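
/*
 * Note on floor/ceiling: free_pgtables() never frees a page table that a
 * neighbouring vma may still need.  As a sketch of the caller side,
 * unmap_region() in mm/mmap.c bounds the free with its neighbours:
 *
 *	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 *		      next ? next->vm_start : USER_PGTABLES_CEILING);
 */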
644
645int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
646{
647 spinlock_t *ptl;
648 pgtable_t new = pte_alloc_one(mm, address);
649 if (!new)
650 return -ENOMEM;
	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time).  Those accesses are a chain of
	 * data-dependent loads, which most CPUs (alpha being the notable
	 * exception) already guarantee to be observed in order.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(map|unmap)_single? */
666
667 ptl = pmd_lock(mm, pmd);
668 if (likely(pmd_none(*pmd))) {
669 mm_inc_nr_ptes(mm);
670 pmd_populate(mm, pmd, new);
671 new = NULL;
672 }
673 spin_unlock(ptl);
674 if (new)
675 pte_free(mm, new);
676 return 0;
677}
678
679int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
680{
681 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
682 if (!new)
683 return -ENOMEM;
684
685 smp_wmb();
686
687 spin_lock(&init_mm.page_table_lock);
688 if (likely(pmd_none(*pmd))) {
689 pmd_populate_kernel(&init_mm, pmd, new);
690 new = NULL;
691 }
692 spin_unlock(&init_mm.page_table_lock);
693 if (new)
694 pte_free_kernel(&init_mm, new);
695 return 0;
696}
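
/*
 * Callers normally reach __pte_alloc() through the pte_alloc_map() and
 * pte_alloc_map_lock() wrappers in <linux/mm.h>, which only call it when
 * pmd_none(*pmd) is seen, e.g. (sketch):
 *
 *	vmf->pte = pte_alloc_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
 */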
697
698static inline void init_rss_vec(int *rss)
699{
700 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
701}
702
703static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
704{
705 int i;
706
707 if (current->mm == mm)
708 sync_mm_rss(mm);
709 for (i = 0; i < NR_MM_COUNTERS; i++)
710 if (rss[i])
711 add_mm_counter(mm, i, rss[i]);
712}
713

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
721static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
722 pte_t pte, struct page *page)
723{
724 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
725 p4d_t *p4d = p4d_offset(pgd, addr);
726 pud_t *pud = pud_offset(p4d, addr);
727 pmd_t *pmd = pmd_offset(pud, addr);
728 struct address_space *mapping;
729 pgoff_t index;
730 static unsigned long resume;
731 static unsigned long nr_shown;
732 static unsigned long nr_unshown;
733
734
735
736
737
738 if (nr_shown == 60) {
739 if (time_before(jiffies, resume)) {
740 nr_unshown++;
741 return;
742 }
743 if (nr_unshown) {
744 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
745 nr_unshown);
746 nr_unshown = 0;
747 }
748 nr_shown = 0;
749 }
750 if (nr_shown++ == 0)
751 resume = jiffies + 60 * HZ;
752
753 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
754 index = linear_page_index(vma, addr);
755
756 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
757 current->comm,
758 (long long)pte_val(pte), (long long)pmd_val(*pmd));
759 if (page)
760 dump_page(page, "bad pte");
761 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
762 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
763
764
765
766 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
767 vma->vm_file,
768 vma->vm_ops ? vma->vm_ops->fault : NULL,
769 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
770 mapping ? mapping->a_ops->readpage : NULL);
771 dump_stack();
772 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
773}

/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch
 * it). In this case we just look up the page's pfn and never touch the
 * struct page itself.  How such mappings are recognized depends on the
 * architecture:
 *
 *  - With __HAVE_ARCH_PTE_SPECIAL, the pte_special() bit directly marks
 *    special mappings (the zero page, and pfns inserted into VM_PFNMAP /
 *    VM_MIXEDMAP areas).
 *
 *  - Without it, we fall back to the VM_PFNMAP and VM_MIXEDMAP vma flags.
 *    For a COW'ed VM_PFNMAP mapping, remap_pfn_range() stores the first
 *    pfn in vm_pgoff, so a pte whose pfn matches the linear layout is the
 *    raw (pageless) mapping, while anything else must be a normal page
 *    inserted later by COW.
 */
817#ifdef __HAVE_ARCH_PTE_SPECIAL
818# define HAVE_PTE_SPECIAL 1
819#else
820# define HAVE_PTE_SPECIAL 0
821#endif
822struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
823 pte_t pte, bool with_public_device)
824{
825 unsigned long pfn = pte_pfn(pte);
826
827 if (HAVE_PTE_SPECIAL) {
828 if (likely(!pte_special(pte)))
829 goto check_pfn;
830 if (vma->vm_ops && vma->vm_ops->find_special_page)
831 return vma->vm_ops->find_special_page(vma, addr);
832 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
833 return NULL;
834 if (is_zero_pfn(pfn))
835 return NULL;
836
837
838
839
840
841
842
843
844
845
846
847
848
849 if (likely(pfn <= highest_memmap_pfn)) {
850 struct page *page = pfn_to_page(pfn);
851
852 if (is_device_public_page(page)) {
853 if (with_public_device)
854 return page;
855 return NULL;
856 }
857 }
858 print_bad_pte(vma, addr, pte, NULL);
859 return NULL;
860 }
861
862
863
864 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
865 if (vma->vm_flags & VM_MIXEDMAP) {
866 if (!pfn_valid(pfn))
867 return NULL;
868 goto out;
869 } else {
870 unsigned long off;
871 off = (addr - vma->vm_start) >> PAGE_SHIFT;
872 if (pfn == vma->vm_pgoff + off)
873 return NULL;
874 if (!is_cow_mapping(vma->vm_flags))
875 return NULL;
876 }
877 }
878
879 if (is_zero_pfn(pfn))
880 return NULL;
881check_pfn:
882 if (unlikely(pfn > highest_memmap_pfn)) {
883 print_bad_pte(vma, addr, pte, NULL);
884 return NULL;
885 }
886
887
888
889
890
891out:
892 return pfn_to_page(pfn);
893}
894
895#ifdef CONFIG_TRANSPARENT_HUGEPAGE
896struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
897 pmd_t pmd)
898{
899 unsigned long pfn = pmd_pfn(pmd);
900
901
902
903
904
905
906 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
907 if (vma->vm_flags & VM_MIXEDMAP) {
908 if (!pfn_valid(pfn))
909 return NULL;
910 goto out;
911 } else {
912 unsigned long off;
913 off = (addr - vma->vm_start) >> PAGE_SHIFT;
914 if (pfn == vma->vm_pgoff + off)
915 return NULL;
916 if (!is_cow_mapping(vma->vm_flags))
917 return NULL;
918 }
919 }
920
921 if (is_zero_pfn(pfn))
922 return NULL;
923 if (unlikely(pfn > highest_memmap_pfn))
924 return NULL;
925
926
927
928
929
930out:
931 return pfn_to_page(pfn);
932}
933#endif
934

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
941static inline unsigned long
942copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
943 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
944 unsigned long addr, int *rss)
945{
946 unsigned long vm_flags = vma->vm_flags;
947 pte_t pte = *src_pte;
948 struct page *page;
949
950
951 if (unlikely(!pte_present(pte))) {
952 swp_entry_t entry = pte_to_swp_entry(pte);
953
954 if (likely(!non_swap_entry(entry))) {
955 if (swap_duplicate(entry) < 0)
956 return entry.val;
957
958
959 if (unlikely(list_empty(&dst_mm->mmlist))) {
960 spin_lock(&mmlist_lock);
961 if (list_empty(&dst_mm->mmlist))
962 list_add(&dst_mm->mmlist,
963 &src_mm->mmlist);
964 spin_unlock(&mmlist_lock);
965 }
966 rss[MM_SWAPENTS]++;
967 } else if (is_migration_entry(entry)) {
968 page = migration_entry_to_page(entry);
969
970 rss[mm_counter(page)]++;
971
972 if (is_write_migration_entry(entry) &&
973 is_cow_mapping(vm_flags)) {
974
975
976
977
978 make_migration_entry_read(&entry);
979 pte = swp_entry_to_pte(entry);
980 if (pte_swp_soft_dirty(*src_pte))
981 pte = pte_swp_mksoft_dirty(pte);
982 set_pte_at(src_mm, addr, src_pte, pte);
983 }
984 } else if (is_device_private_entry(entry)) {
985 page = device_private_entry_to_page(entry);
986
987
988
989
990
991
992
993
994
995
996 get_page(page);
997 rss[mm_counter(page)]++;
998 page_dup_rmap(page, false);
999
1000
1001
1002
1003
1004
1005
1006
1007 if (is_write_device_private_entry(entry) &&
1008 is_cow_mapping(vm_flags)) {
1009 make_device_private_entry_read(&entry);
1010 pte = swp_entry_to_pte(entry);
1011 set_pte_at(src_mm, addr, src_pte, pte);
1012 }
1013 }
1014 goto out_set_pte;
1015 }
1016
	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child.
	 */
1021 if (is_cow_mapping(vm_flags)) {
1022 ptep_set_wrprotect(src_mm, addr, src_pte);
1023 pte = pte_wrprotect(pte);
1024 }
1025
1026
1027
1028
1029
1030 if (vm_flags & VM_SHARED)
1031 pte = pte_mkclean(pte);
1032 pte = pte_mkold(pte);
1033
1034 page = vm_normal_page(vma, addr, pte);
1035 if (page) {
1036 get_page(page);
1037 page_dup_rmap(page, false);
1038 rss[mm_counter(page)]++;
1039 } else if (pte_devmap(pte)) {
1040 page = pte_page(pte);
1041
1042
1043
1044
1045
1046
1047 if (is_device_public_page(page)) {
1048 get_page(page);
1049 page_dup_rmap(page, false);
1050 rss[mm_counter(page)]++;
1051 }
1052 }
1053
1054out_set_pte:
1055 set_pte_at(dst_mm, addr, dst_pte, pte);
1056 return 0;
1057}
1058
1059static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1060 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
1061 unsigned long addr, unsigned long end)
1062{
1063 pte_t *orig_src_pte, *orig_dst_pte;
1064 pte_t *src_pte, *dst_pte;
1065 spinlock_t *src_ptl, *dst_ptl;
1066 int progress = 0;
1067 int rss[NR_MM_COUNTERS];
1068 swp_entry_t entry = (swp_entry_t){0};
1069
1070again:
1071 init_rss_vec(rss);
1072
1073 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1074 if (!dst_pte)
1075 return -ENOMEM;
1076 src_pte = pte_offset_map(src_pmd, addr);
1077 src_ptl = pte_lockptr(src_mm, src_pmd);
1078 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1079 orig_src_pte = src_pte;
1080 orig_dst_pte = dst_pte;
1081 arch_enter_lazy_mmu_mode();
1082
1083 do {
1084
1085
1086
1087
1088 if (progress >= 32) {
1089 progress = 0;
1090 if (need_resched() ||
1091 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1092 break;
1093 }
1094 if (pte_none(*src_pte)) {
1095 progress++;
1096 continue;
1097 }
1098 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
1099 vma, addr, rss);
1100 if (entry.val)
1101 break;
1102 progress += 8;
1103 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1104
1105 arch_leave_lazy_mmu_mode();
1106 spin_unlock(src_ptl);
1107 pte_unmap(orig_src_pte);
1108 add_mm_rss_vec(dst_mm, rss);
1109 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1110 cond_resched();
1111
1112 if (entry.val) {
1113 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
1114 return -ENOMEM;
1115 progress = 0;
1116 }
1117 if (addr != end)
1118 goto again;
1119 return 0;
1120}
1121
1122static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1123 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
1124 unsigned long addr, unsigned long end)
1125{
1126 pmd_t *src_pmd, *dst_pmd;
1127 unsigned long next;
1128
1129 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1130 if (!dst_pmd)
1131 return -ENOMEM;
1132 src_pmd = pmd_offset(src_pud, addr);
1133 do {
1134 next = pmd_addr_end(addr, end);
1135 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1136 || pmd_devmap(*src_pmd)) {
1137 int err;
1138 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
1139 err = copy_huge_pmd(dst_mm, src_mm,
1140 dst_pmd, src_pmd, addr, vma);
1141 if (err == -ENOMEM)
1142 return -ENOMEM;
1143 if (!err)
1144 continue;
1145
1146 }
1147 if (pmd_none_or_clear_bad(src_pmd))
1148 continue;
1149 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1150 vma, addr, next))
1151 return -ENOMEM;
1152 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1153 return 0;
1154}
1155
1156static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1157 p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
1158 unsigned long addr, unsigned long end)
1159{
1160 pud_t *src_pud, *dst_pud;
1161 unsigned long next;
1162
1163 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1164 if (!dst_pud)
1165 return -ENOMEM;
1166 src_pud = pud_offset(src_p4d, addr);
1167 do {
1168 next = pud_addr_end(addr, end);
1169 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1170 int err;
1171
1172 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
1173 err = copy_huge_pud(dst_mm, src_mm,
1174 dst_pud, src_pud, addr, vma);
1175 if (err == -ENOMEM)
1176 return -ENOMEM;
1177 if (!err)
1178 continue;
1179
1180 }
1181 if (pud_none_or_clear_bad(src_pud))
1182 continue;
1183 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1184 vma, addr, next))
1185 return -ENOMEM;
1186 } while (dst_pud++, src_pud++, addr = next, addr != end);
1187 return 0;
1188}
1189
1190static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1191 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1192 unsigned long addr, unsigned long end)
1193{
1194 p4d_t *src_p4d, *dst_p4d;
1195 unsigned long next;
1196
1197 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1198 if (!dst_p4d)
1199 return -ENOMEM;
1200 src_p4d = p4d_offset(src_pgd, addr);
1201 do {
1202 next = p4d_addr_end(addr, end);
1203 if (p4d_none_or_clear_bad(src_p4d))
1204 continue;
1205 if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
1206 vma, addr, next))
1207 return -ENOMEM;
1208 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1209 return 0;
1210}
1211
1212int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1213 struct vm_area_struct *vma)
1214{
1215 pgd_t *src_pgd, *dst_pgd;
1216 unsigned long next;
1217 unsigned long addr = vma->vm_start;
1218 unsigned long end = vma->vm_end;
1219 unsigned long mmun_start;
1220 unsigned long mmun_end;
1221 bool is_cow;
1222 int ret;
1223
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1230 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1231 !vma->anon_vma)
1232 return 0;
1233
1234 if (is_vm_hugetlb_page(vma))
1235 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1236
1237 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1238
1239
1240
1241
1242 ret = track_pfn_copy(vma);
1243 if (ret)
1244 return ret;
1245 }
1246
1247
1248
1249
1250
1251
1252
1253 is_cow = is_cow_mapping(vma->vm_flags);
1254 mmun_start = addr;
1255 mmun_end = end;
1256 if (is_cow)
1257 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1258 mmun_end);
1259
1260 ret = 0;
1261 dst_pgd = pgd_offset(dst_mm, addr);
1262 src_pgd = pgd_offset(src_mm, addr);
1263 do {
1264 next = pgd_addr_end(addr, end);
1265 if (pgd_none_or_clear_bad(src_pgd))
1266 continue;
1267 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
1268 vma, addr, next))) {
1269 ret = -ENOMEM;
1270 break;
1271 }
1272 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1273
1274 if (is_cow)
1275 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1276 return ret;
1277}
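
/*
 * Sketch of the caller side (kernel/fork.c:dup_mmap(), simplified): for
 * each vma of the parent that is not marked VM_WIPEONFORK,
 *
 *	retval = copy_page_range(mm, oldmm, mpnt);
 */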
1278
1279static unsigned long zap_pte_range(struct mmu_gather *tlb,
1280 struct vm_area_struct *vma, pmd_t *pmd,
1281 unsigned long addr, unsigned long end,
1282 struct zap_details *details)
1283{
1284 struct mm_struct *mm = tlb->mm;
1285 int force_flush = 0;
1286 int rss[NR_MM_COUNTERS];
1287 spinlock_t *ptl;
1288 pte_t *start_pte;
1289 pte_t *pte;
1290 swp_entry_t entry;
1291
1292 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
1293again:
1294 init_rss_vec(rss);
1295 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1296 pte = start_pte;
1297 flush_tlb_batched_pending(mm);
1298 arch_enter_lazy_mmu_mode();
1299 do {
1300 pte_t ptent = *pte;
1301 if (pte_none(ptent))
1302 continue;
1303
1304 if (pte_present(ptent)) {
1305 struct page *page;
1306
1307 page = _vm_normal_page(vma, addr, ptent, true);
1308 if (unlikely(details) && page) {
1309
1310
1311
1312
1313
1314 if (details->check_mapping &&
1315 details->check_mapping != page_rmapping(page))
1316 continue;
1317 }
1318 ptent = ptep_get_and_clear_full(mm, addr, pte,
1319 tlb->fullmm);
1320 tlb_remove_tlb_entry(tlb, pte, addr);
1321 if (unlikely(!page))
1322 continue;
1323
1324 if (!PageAnon(page)) {
1325 if (pte_dirty(ptent)) {
1326 force_flush = 1;
1327 set_page_dirty(page);
1328 }
1329 if (pte_young(ptent) &&
1330 likely(!(vma->vm_flags & VM_SEQ_READ)))
1331 mark_page_accessed(page);
1332 }
1333 rss[mm_counter(page)]--;
1334 page_remove_rmap(page, false);
1335 if (unlikely(page_mapcount(page) < 0))
1336 print_bad_pte(vma, addr, ptent, page);
1337 if (unlikely(__tlb_remove_page(tlb, page))) {
1338 force_flush = 1;
1339 addr += PAGE_SIZE;
1340 break;
1341 }
1342 continue;
1343 }
1344
1345 entry = pte_to_swp_entry(ptent);
1346 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1347 struct page *page = device_private_entry_to_page(entry);
1348
1349 if (unlikely(details && details->check_mapping)) {
1350
1351
1352
1353
1354
1355 if (details->check_mapping !=
1356 page_rmapping(page))
1357 continue;
1358 }
1359
1360 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1361 rss[mm_counter(page)]--;
1362 page_remove_rmap(page, false);
1363 put_page(page);
1364 continue;
1365 }
1366
1367
1368 if (unlikely(details))
1369 continue;
1370
1371 entry = pte_to_swp_entry(ptent);
1372 if (!non_swap_entry(entry))
1373 rss[MM_SWAPENTS]--;
1374 else if (is_migration_entry(entry)) {
1375 struct page *page;
1376
1377 page = migration_entry_to_page(entry);
1378 rss[mm_counter(page)]--;
1379 }
1380 if (unlikely(!free_swap_and_cache(entry)))
1381 print_bad_pte(vma, addr, ptent, NULL);
1382 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1383 } while (pte++, addr += PAGE_SIZE, addr != end);
1384
1385 add_mm_rss_vec(mm, rss);
1386 arch_leave_lazy_mmu_mode();
1387
1388
1389 if (force_flush)
1390 tlb_flush_mmu_tlbonly(tlb);
1391 pte_unmap_unlock(start_pte, ptl);
1392
1393
1394
1395
1396
1397
1398
1399 if (force_flush) {
1400 force_flush = 0;
1401 tlb_flush_mmu_free(tlb);
1402 if (addr != end)
1403 goto again;
1404 }
1405
1406 return addr;
1407}
1408
1409static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1410 struct vm_area_struct *vma, pud_t *pud,
1411 unsigned long addr, unsigned long end,
1412 struct zap_details *details)
1413{
1414 pmd_t *pmd;
1415 unsigned long next;
1416
1417 pmd = pmd_offset(pud, addr);
1418 do {
1419 next = pmd_addr_end(addr, end);
1420 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1421 if (next - addr != HPAGE_PMD_SIZE) {
1422 VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
1423 !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1424 __split_huge_pmd(vma, pmd, addr, false, NULL);
1425 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1426 goto next;
1427
1428 }
1429
1430
1431
1432
1433
1434
1435
1436 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1437 goto next;
1438 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1439next:
1440 cond_resched();
1441 } while (pmd++, addr = next, addr != end);
1442
1443 return addr;
1444}
1445
1446static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1447 struct vm_area_struct *vma, p4d_t *p4d,
1448 unsigned long addr, unsigned long end,
1449 struct zap_details *details)
1450{
1451 pud_t *pud;
1452 unsigned long next;
1453
1454 pud = pud_offset(p4d, addr);
1455 do {
1456 next = pud_addr_end(addr, end);
1457 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1458 if (next - addr != HPAGE_PUD_SIZE) {
1459 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1460 split_huge_pud(vma, pud, addr);
1461 } else if (zap_huge_pud(tlb, vma, pud, addr))
1462 goto next;
1463
1464 }
1465 if (pud_none_or_clear_bad(pud))
1466 continue;
1467 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1468next:
1469 cond_resched();
1470 } while (pud++, addr = next, addr != end);
1471
1472 return addr;
1473}
1474
1475static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1476 struct vm_area_struct *vma, pgd_t *pgd,
1477 unsigned long addr, unsigned long end,
1478 struct zap_details *details)
1479{
1480 p4d_t *p4d;
1481 unsigned long next;
1482
1483 p4d = p4d_offset(pgd, addr);
1484 do {
1485 next = p4d_addr_end(addr, end);
1486 if (p4d_none_or_clear_bad(p4d))
1487 continue;
1488 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1489 } while (p4d++, addr = next, addr != end);
1490
1491 return addr;
1492}
1493
1494void unmap_page_range(struct mmu_gather *tlb,
1495 struct vm_area_struct *vma,
1496 unsigned long addr, unsigned long end,
1497 struct zap_details *details)
1498{
1499 pgd_t *pgd;
1500 unsigned long next;
1501
1502 BUG_ON(addr >= end);
1503 tlb_start_vma(tlb, vma);
1504 pgd = pgd_offset(vma->vm_mm, addr);
1505 do {
1506 next = pgd_addr_end(addr, end);
1507 if (pgd_none_or_clear_bad(pgd))
1508 continue;
1509 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1510 } while (pgd++, addr = next, addr != end);
1511 tlb_end_vma(tlb, vma);
1512}
1513
1514
1515static void unmap_single_vma(struct mmu_gather *tlb,
1516 struct vm_area_struct *vma, unsigned long start_addr,
1517 unsigned long end_addr,
1518 struct zap_details *details)
1519{
1520 unsigned long start = max(vma->vm_start, start_addr);
1521 unsigned long end;
1522
1523 if (start >= vma->vm_end)
1524 return;
1525 end = min(vma->vm_end, end_addr);
1526 if (end <= vma->vm_start)
1527 return;
1528
1529 if (vma->vm_file)
1530 uprobe_munmap(vma, start, end);
1531
1532 if (unlikely(vma->vm_flags & VM_PFNMAP))
1533 untrack_pfn(vma, 0, 0);
1534
1535 if (start != end) {
1536 if (unlikely(is_vm_hugetlb_page(vma))) {
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548 if (vma->vm_file) {
1549 i_mmap_lock_write(vma->vm_file->f_mapping);
1550 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1551 i_mmap_unlock_write(vma->vm_file->f_mapping);
1552 }
1553 } else
1554 unmap_page_range(tlb, vma, start, end, details);
1555 }
1556}
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576void unmap_vmas(struct mmu_gather *tlb,
1577 struct vm_area_struct *vma, unsigned long start_addr,
1578 unsigned long end_addr)
1579{
1580 struct mm_struct *mm = vma->vm_mm;
1581
1582 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1583 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1584 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1585 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1586}
1587

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list.
 */
1596void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1597 unsigned long size)
1598{
1599 struct mm_struct *mm = vma->vm_mm;
1600 struct mmu_gather tlb;
1601 unsigned long end = start + size;
1602
1603 lru_add_drain();
1604 tlb_gather_mmu(&tlb, mm, start, end);
1605 update_hiwater_rss(mm);
1606 mmu_notifier_invalidate_range_start(mm, start, end);
1607 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
1608 unmap_single_vma(&tlb, vma, start, end, NULL);
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618 flush_tlb_range(vma, start, end);
1619 }
1620
1621 mmu_notifier_invalidate_range_end(mm, start, end);
1622 tlb_finish_mmu(&tlb, start, end);
1623}
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1635 unsigned long size, struct zap_details *details)
1636{
1637 struct mm_struct *mm = vma->vm_mm;
1638 struct mmu_gather tlb;
1639 unsigned long end = address + size;
1640
1641 lru_add_drain();
1642 tlb_gather_mmu(&tlb, mm, address, end);
1643 update_hiwater_rss(mm);
1644 mmu_notifier_invalidate_range_start(mm, address, end);
1645 unmap_single_vma(&tlb, vma, address, end, details);
1646 mmu_notifier_invalidate_range_end(mm, address, end);
1647 tlb_finish_mmu(&tlb, address, end);
1648}
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1663 unsigned long size)
1664{
1665 if (address < vma->vm_start || address + size > vma->vm_end ||
1666 !(vma->vm_flags & VM_PFNMAP))
1667 return -1;
1668 zap_page_range_single(vma, address, size, NULL);
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(zap_vma_ptes);
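
/*
 * Illustrative use (hypothetical driver) of zap_vma_ptes(): revoke a
 * previously inserted PFN mapping so that later user accesses re-fault:
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 */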
1672
1673pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1674 spinlock_t **ptl)
1675{
1676 pgd_t *pgd;
1677 p4d_t *p4d;
1678 pud_t *pud;
1679 pmd_t *pmd;
1680
1681 pgd = pgd_offset(mm, addr);
1682 p4d = p4d_alloc(mm, pgd, addr);
1683 if (!p4d)
1684 return NULL;
1685 pud = pud_alloc(mm, p4d, addr);
1686 if (!pud)
1687 return NULL;
1688 pmd = pmd_alloc(mm, pud, addr);
1689 if (!pmd)
1690 return NULL;
1691
1692 VM_BUG_ON(pmd_trans_huge(*pmd));
1693 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1694}
1695
1696
1697
1698
1699
1700
1701
1702
1703static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1704 struct page *page, pgprot_t prot)
1705{
1706 struct mm_struct *mm = vma->vm_mm;
1707 int retval;
1708 pte_t *pte;
1709 spinlock_t *ptl;
1710
1711 retval = -EINVAL;
1712 if (PageAnon(page))
1713 goto out;
1714 retval = -ENOMEM;
1715 flush_dcache_page(page);
1716 pte = get_locked_pte(mm, addr, &ptl);
1717 if (!pte)
1718 goto out;
1719 retval = -EBUSY;
1720 if (!pte_none(*pte))
1721 goto out_unlock;
1722
1723
1724 get_page(page);
1725 inc_mm_counter_fast(mm, mm_counter_file(page));
1726 page_add_file_rmap(page, false);
1727 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1728
1729 retval = 0;
1730 pte_unmap_unlock(pte, ptl);
1731 return retval;
1732out_unlock:
1733 pte_unmap_unlock(pte, ptl);
1734out:
1735 return retval;
1736}
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1766 struct page *page)
1767{
1768 if (addr < vma->vm_start || addr >= vma->vm_end)
1769 return -EFAULT;
1770 if (!page_count(page))
1771 return -EINVAL;
1772 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1773 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1774 BUG_ON(vma->vm_flags & VM_PFNMAP);
1775 vma->vm_flags |= VM_MIXEDMAP;
1776 }
1777 return insert_page(vma, addr, page, vma->vm_page_prot);
1778}
1779EXPORT_SYMBOL(vm_insert_page);
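
/*
 * Illustrative sketch of a hypothetical driver ->mmap handler mapping a
 * page-backed kernel buffer with vm_insert_page() ("buf" and its pages
 * array are assumptions for the example, not kernel API):
 *
 *	static int buf_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long addr = vma->vm_start;
 *		int i = 0, err = 0;
 *
 *		while (!err && addr < vma->vm_end) {
 *			err = vm_insert_page(vma, addr, buf->pages[i++]);
 *			addr += PAGE_SIZE;
 *		}
 *		return err;
 *	}
 */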
1780
1781static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1782 pfn_t pfn, pgprot_t prot, bool mkwrite)
1783{
1784 struct mm_struct *mm = vma->vm_mm;
1785 int retval;
1786 pte_t *pte, entry;
1787 spinlock_t *ptl;
1788
1789 retval = -ENOMEM;
1790 pte = get_locked_pte(mm, addr, &ptl);
1791 if (!pte)
1792 goto out;
1793 retval = -EBUSY;
1794 if (!pte_none(*pte)) {
1795 if (mkwrite) {
1796
1797
1798
1799
1800
1801
1802
1803 if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
1804 goto out_unlock;
1805 entry = *pte;
1806 goto out_mkwrite;
1807 } else
1808 goto out_unlock;
1809 }
1810
1811
1812 if (pfn_t_devmap(pfn))
1813 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1814 else
1815 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1816
1817out_mkwrite:
1818 if (mkwrite) {
1819 entry = pte_mkyoung(entry);
1820 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1821 }
1822
1823 set_pte_at(mm, addr, pte, entry);
1824 update_mmu_cache(vma, addr, pte);
1825
1826 retval = 0;
1827out_unlock:
1828 pte_unmap_unlock(pte, ptl);
1829out:
1830 return retval;
1831}
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1851 unsigned long pfn)
1852{
1853 return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1854}
1855EXPORT_SYMBOL(vm_insert_pfn);
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1873 unsigned long pfn, pgprot_t pgprot)
1874{
1875 int ret;
1876
1877
1878
1879
1880
1881
1882 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1883 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1884 (VM_PFNMAP|VM_MIXEDMAP));
1885 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1886 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1887
1888 if (addr < vma->vm_start || addr >= vma->vm_end)
1889 return -EFAULT;
1890
1891 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1892
1893 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1894 false);
1895
1896 return ret;
1897}
1898EXPORT_SYMBOL(vm_insert_pfn_prot);
1899
1900static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1901 pfn_t pfn, bool mkwrite)
1902{
1903 pgprot_t pgprot = vma->vm_page_prot;
1904
1905 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1906
1907 if (addr < vma->vm_start || addr >= vma->vm_end)
1908 return -EFAULT;
1909
1910 track_pfn_insert(vma, &pgprot, pfn);
1911
1912
1913
1914
1915
1916
1917
1918
1919 if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1920 struct page *page;
1921
1922
1923
1924
1925
1926
1927 page = pfn_to_page(pfn_t_to_pfn(pfn));
1928 return insert_page(vma, addr, page, pgprot);
1929 }
1930 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1931}
1932
1933int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1934 pfn_t pfn)
1935{
1936 return __vm_insert_mixed(vma, addr, pfn, false);
1937
1938}
1939EXPORT_SYMBOL(vm_insert_mixed);
1940
1941int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
1942 pfn_t pfn)
1943{
1944 return __vm_insert_mixed(vma, addr, pfn, true);
1945}
1946EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
1947
1948
1949
1950
1951
1952
1953static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1954 unsigned long addr, unsigned long end,
1955 unsigned long pfn, pgprot_t prot)
1956{
1957 pte_t *pte;
1958 spinlock_t *ptl;
1959
1960 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1961 if (!pte)
1962 return -ENOMEM;
1963 arch_enter_lazy_mmu_mode();
1964 do {
1965 BUG_ON(!pte_none(*pte));
1966 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1967 pfn++;
1968 } while (pte++, addr += PAGE_SIZE, addr != end);
1969 arch_leave_lazy_mmu_mode();
1970 pte_unmap_unlock(pte - 1, ptl);
1971 return 0;
1972}
1973
1974static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1975 unsigned long addr, unsigned long end,
1976 unsigned long pfn, pgprot_t prot)
1977{
1978 pmd_t *pmd;
1979 unsigned long next;
1980
1981 pfn -= addr >> PAGE_SHIFT;
1982 pmd = pmd_alloc(mm, pud, addr);
1983 if (!pmd)
1984 return -ENOMEM;
1985 VM_BUG_ON(pmd_trans_huge(*pmd));
1986 do {
1987 next = pmd_addr_end(addr, end);
1988 if (remap_pte_range(mm, pmd, addr, next,
1989 pfn + (addr >> PAGE_SHIFT), prot))
1990 return -ENOMEM;
1991 } while (pmd++, addr = next, addr != end);
1992 return 0;
1993}
1994
1995static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
1996 unsigned long addr, unsigned long end,
1997 unsigned long pfn, pgprot_t prot)
1998{
1999 pud_t *pud;
2000 unsigned long next;
2001
2002 pfn -= addr >> PAGE_SHIFT;
2003 pud = pud_alloc(mm, p4d, addr);
2004 if (!pud)
2005 return -ENOMEM;
2006 do {
2007 next = pud_addr_end(addr, end);
2008 if (remap_pmd_range(mm, pud, addr, next,
2009 pfn + (addr >> PAGE_SHIFT), prot))
2010 return -ENOMEM;
2011 } while (pud++, addr = next, addr != end);
2012 return 0;
2013}
2014
2015static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2016 unsigned long addr, unsigned long end,
2017 unsigned long pfn, pgprot_t prot)
2018{
2019 p4d_t *p4d;
2020 unsigned long next;
2021
2022 pfn -= addr >> PAGE_SHIFT;
2023 p4d = p4d_alloc(mm, pgd, addr);
2024 if (!p4d)
2025 return -ENOMEM;
2026 do {
2027 next = p4d_addr_end(addr, end);
2028 if (remap_pud_range(mm, p4d, addr, next,
2029 pfn + (addr >> PAGE_SHIFT), prot))
2030 return -ENOMEM;
2031 } while (p4d++, addr = next, addr != end);
2032 return 0;
2033}
2034

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */
2045int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2046 unsigned long pfn, unsigned long size, pgprot_t prot)
2047{
2048 pgd_t *pgd;
2049 unsigned long next;
2050 unsigned long end = addr + PAGE_ALIGN(size);
2051 struct mm_struct *mm = vma->vm_mm;
2052 unsigned long remap_pfn = pfn;
2053 int err;
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073 if (is_cow_mapping(vma->vm_flags)) {
2074 if (addr != vma->vm_start || end != vma->vm_end)
2075 return -EINVAL;
2076 vma->vm_pgoff = pfn;
2077 }
2078
2079 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
2080 if (err)
2081 return -EINVAL;
2082
2083 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2084
2085 BUG_ON(addr >= end);
2086 pfn -= addr >> PAGE_SHIFT;
2087 pgd = pgd_offset(mm, addr);
2088 flush_cache_range(vma, addr, end);
2089 do {
2090 next = pgd_addr_end(addr, end);
2091 err = remap_p4d_range(mm, pgd, addr, next,
2092 pfn + (addr >> PAGE_SHIFT), prot);
2093 if (err)
2094 break;
2095 } while (pgd++, addr = next, addr != end);
2096
2097 if (err)
2098 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
2099
2100 return err;
2101}
2102EXPORT_SYMBOL(remap_pfn_range);
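
/*
 * Illustrative sketch of a hypothetical driver ->mmap handler using
 * remap_pfn_range() to map a physical region 'phys' (an assumed local
 * variable) across the whole vma:
 *
 *	static int dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       phys >> PAGE_SHIFT,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */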
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2118{
2119 unsigned long vm_len, pfn, pages;
2120
2121
2122 if (start + len < start)
2123 return -EINVAL;
2124
2125
2126
2127
2128
2129 len += start & ~PAGE_MASK;
2130 pfn = start >> PAGE_SHIFT;
2131 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2132 if (pfn + pages < pfn)
2133 return -EINVAL;
2134
2135
2136 if (vma->vm_pgoff > pages)
2137 return -EINVAL;
2138 pfn += vma->vm_pgoff;
2139 pages -= vma->vm_pgoff;
2140
2141
2142 vm_len = vma->vm_end - vma->vm_start;
2143 if (vm_len >> PAGE_SHIFT > pages)
2144 return -EINVAL;
2145
2146
2147 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2148}
2149EXPORT_SYMBOL(vm_iomap_memory);
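
/*
 * Illustrative sketch (hypothetical framebuffer-style driver): map the
 * device aperture and let vm_iomap_memory() do the pfn/size checks
 * ("info" and its fields are assumptions, not kernel API):
 *
 *	return vm_iomap_memory(vma, info->aperture_base, info->aperture_size);
 */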
2150
2151static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2152 unsigned long addr, unsigned long end,
2153 pte_fn_t fn, void *data)
2154{
2155 pte_t *pte;
2156 int err;
2157 pgtable_t token;
2158 spinlock_t *uninitialized_var(ptl);
2159
2160 pte = (mm == &init_mm) ?
2161 pte_alloc_kernel(pmd, addr) :
2162 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2163 if (!pte)
2164 return -ENOMEM;
2165
2166 BUG_ON(pmd_huge(*pmd));
2167
2168 arch_enter_lazy_mmu_mode();
2169
2170 token = pmd_pgtable(*pmd);
2171
2172 do {
2173 err = fn(pte++, token, addr, data);
2174 if (err)
2175 break;
2176 } while (addr += PAGE_SIZE, addr != end);
2177
2178 arch_leave_lazy_mmu_mode();
2179
2180 if (mm != &init_mm)
2181 pte_unmap_unlock(pte-1, ptl);
2182 return err;
2183}
2184
2185static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2186 unsigned long addr, unsigned long end,
2187 pte_fn_t fn, void *data)
2188{
2189 pmd_t *pmd;
2190 unsigned long next;
2191 int err;
2192
2193 BUG_ON(pud_huge(*pud));
2194
2195 pmd = pmd_alloc(mm, pud, addr);
2196 if (!pmd)
2197 return -ENOMEM;
2198 do {
2199 next = pmd_addr_end(addr, end);
2200 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2201 if (err)
2202 break;
2203 } while (pmd++, addr = next, addr != end);
2204 return err;
2205}
2206
2207static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2208 unsigned long addr, unsigned long end,
2209 pte_fn_t fn, void *data)
2210{
2211 pud_t *pud;
2212 unsigned long next;
2213 int err;
2214
2215 pud = pud_alloc(mm, p4d, addr);
2216 if (!pud)
2217 return -ENOMEM;
2218 do {
2219 next = pud_addr_end(addr, end);
2220 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2221 if (err)
2222 break;
2223 } while (pud++, addr = next, addr != end);
2224 return err;
2225}
2226
2227static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2228 unsigned long addr, unsigned long end,
2229 pte_fn_t fn, void *data)
2230{
2231 p4d_t *p4d;
2232 unsigned long next;
2233 int err;
2234
2235 p4d = p4d_alloc(mm, pgd, addr);
2236 if (!p4d)
2237 return -ENOMEM;
2238 do {
2239 next = p4d_addr_end(addr, end);
2240 err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2241 if (err)
2242 break;
2243 } while (p4d++, addr = next, addr != end);
2244 return err;
2245}
2246

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2251int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2252 unsigned long size, pte_fn_t fn, void *data)
2253{
2254 pgd_t *pgd;
2255 unsigned long next;
2256 unsigned long end = addr + size;
2257 int err;
2258
2259 if (WARN_ON(addr >= end))
2260 return -EINVAL;
2261
2262 pgd = pgd_offset(mm, addr);
2263 do {
2264 next = pgd_addr_end(addr, end);
2265 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2266 if (err)
2267 break;
2268 } while (pgd++, addr = next, addr != end);
2269
2270 return err;
2271}
2272EXPORT_SYMBOL_GPL(apply_to_page_range);
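
/*
 * Minimal sketch of an apply_to_page_range() user: a pte_fn_t callback
 * that installs ptes from a caller-provided context ("set_one_pte" and
 * "struct my_ctx" are hypothetical, not kernel API):
 *
 *	static int set_one_pte(pte_t *pte, pgtable_t token,
 *			       unsigned long addr, void *data)
 *	{
 *		struct my_ctx *ctx = data;
 *
 *		set_pte_at(ctx->mm, addr, pte, pfn_pte(ctx->pfn++, ctx->prot));
 *		return 0;
 *	}
 *
 *	apply_to_page_range(mm, start, size, set_one_pte, ctx);
 */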
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2283 pte_t *page_table, pte_t orig_pte)
2284{
2285 int same = 1;
2286#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2287 if (sizeof(pte_t) > sizeof(unsigned long)) {
2288 spinlock_t *ptl = pte_lockptr(mm, pmd);
2289 spin_lock(ptl);
2290 same = pte_same(*page_table, orig_pte);
2291 spin_unlock(ptl);
2292 }
2293#endif
2294 pte_unmap(page_table);
2295 return same;
2296}
2297
2298static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2299{
2300 debug_dma_assert_idle(src);
2301
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
2308 if (unlikely(!src)) {
2309 void *kaddr = kmap_atomic(dst);
2310 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2311
2312
2313
2314
2315
2316
2317
2318 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2319 clear_page(kaddr);
2320 kunmap_atomic(kaddr);
2321 flush_dcache_page(dst);
2322 } else
2323 copy_user_highpage(dst, src, va, vma);
2324}
2325
2326static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2327{
2328 struct file *vm_file = vma->vm_file;
2329
2330 if (vm_file)
2331 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2332
2333
2334
2335
2336
2337 return GFP_KERNEL;
2338}
2339
2340
2341
2342
2343
2344
2345
2346static int do_page_mkwrite(struct vm_fault *vmf)
2347{
2348 int ret;
2349 struct page *page = vmf->page;
2350 unsigned int old_flags = vmf->flags;
2351
2352 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2353
2354 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2355
2356 vmf->flags = old_flags;
2357 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2358 return ret;
2359 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2360 lock_page(page);
2361 if (!page->mapping) {
2362 unlock_page(page);
2363 return 0;
2364 }
2365 ret |= VM_FAULT_LOCKED;
2366 } else
2367 VM_BUG_ON_PAGE(!PageLocked(page), page);
2368 return ret;
2369}
2370
2371
2372
2373
2374
2375
2376static void fault_dirty_shared_page(struct vm_area_struct *vma,
2377 struct page *page)
2378{
2379 struct address_space *mapping;
2380 bool dirtied;
2381 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2382
2383 dirtied = set_page_dirty(page);
2384 VM_BUG_ON_PAGE(PageAnon(page), page);
2385
2386
2387
2388
2389
2390
2391 mapping = page_rmapping(page);
2392 unlock_page(page);
2393
2394 if ((dirtied || page_mkwrite) && mapping) {
2395
2396
2397
2398
2399 balance_dirty_pages_ratelimited(mapping);
2400 }
2401
2402 if (!page_mkwrite)
2403 file_update_time(vma->vm_file);
2404}
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414static inline void wp_page_reuse(struct vm_fault *vmf)
2415 __releases(vmf->ptl)
2416{
2417 struct vm_area_struct *vma = vmf->vma;
2418 struct page *page = vmf->page;
2419 pte_t entry;
2420
2421
2422
2423
2424
2425 if (page)
2426 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2427
2428 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2429 entry = pte_mkyoung(vmf->orig_pte);
2430 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2431 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2432 update_mmu_cache(vma, vmf->address, vmf->pte);
2433 pte_unmap_unlock(vmf->pte, vmf->ptl);
2434}
2435
/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the contents of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page.
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
2452static int wp_page_copy(struct vm_fault *vmf)
2453{
2454 struct vm_area_struct *vma = vmf->vma;
2455 struct mm_struct *mm = vma->vm_mm;
2456 struct page *old_page = vmf->page;
2457 struct page *new_page = NULL;
2458 pte_t entry;
2459 int page_copied = 0;
2460 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2461 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2462 struct mem_cgroup *memcg;
2463
2464 if (unlikely(anon_vma_prepare(vma)))
2465 goto oom;
2466
2467 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2468 new_page = alloc_zeroed_user_highpage_movable(vma,
2469 vmf->address);
2470 if (!new_page)
2471 goto oom;
2472 } else {
2473 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2474 vmf->address);
2475 if (!new_page)
2476 goto oom;
2477 cow_user_page(new_page, old_page, vmf->address, vma);
2478 }
2479
2480 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2481 goto oom_free_new;
2482
2483 __SetPageUptodate(new_page);
2484
2485 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2486
2487
2488
2489
2490 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2491 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2492 if (old_page) {
2493 if (!PageAnon(old_page)) {
2494 dec_mm_counter_fast(mm,
2495 mm_counter_file(old_page));
2496 inc_mm_counter_fast(mm, MM_ANONPAGES);
2497 }
2498 } else {
2499 inc_mm_counter_fast(mm, MM_ANONPAGES);
2500 }
2501 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2502 entry = mk_pte(new_page, vma->vm_page_prot);
2503 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2504
2505
2506
2507
2508
2509
2510 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2511 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2512 mem_cgroup_commit_charge(new_page, memcg, false, false);
2513 lru_cache_add_active_or_unevictable(new_page, vma);
2514
2515
2516
2517
2518
2519 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2520 update_mmu_cache(vma, vmf->address, vmf->pte);
2521 if (old_page) {
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544 page_remove_rmap(old_page, false);
2545 }
2546
2547
2548 new_page = old_page;
2549 page_copied = 1;
2550 } else {
2551 mem_cgroup_cancel_charge(new_page, memcg, false);
2552 }
2553
2554 if (new_page)
2555 put_page(new_page);
2556
2557 pte_unmap_unlock(vmf->pte, vmf->ptl);
2558
2559
2560
2561
2562 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
2563 if (old_page) {
2564
2565
2566
2567
2568 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2569 lock_page(old_page);
2570 if (PageMlocked(old_page))
2571 munlock_vma_page(old_page);
2572 unlock_page(old_page);
2573 }
2574 put_page(old_page);
2575 }
2576 return page_copied ? VM_FAULT_WRITE : 0;
2577oom_free_new:
2578 put_page(new_page);
2579oom:
2580 if (old_page)
2581 put_page(old_page);
2582 return VM_FAULT_OOM;
2583}
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600int finish_mkwrite_fault(struct vm_fault *vmf)
2601{
2602 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2603 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2604 &vmf->ptl);
2605
2606
2607
2608
2609 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2610 pte_unmap_unlock(vmf->pte, vmf->ptl);
2611 return VM_FAULT_NOPAGE;
2612 }
2613 wp_page_reuse(vmf);
2614 return 0;
2615}
2616
2617
2618
2619
2620
2621static int wp_pfn_shared(struct vm_fault *vmf)
2622{
2623 struct vm_area_struct *vma = vmf->vma;
2624
2625 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2626 int ret;
2627
2628 pte_unmap_unlock(vmf->pte, vmf->ptl);
2629 vmf->flags |= FAULT_FLAG_MKWRITE;
2630 ret = vma->vm_ops->pfn_mkwrite(vmf);
2631 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2632 return ret;
2633 return finish_mkwrite_fault(vmf);
2634 }
2635 wp_page_reuse(vmf);
2636 return VM_FAULT_WRITE;
2637}
2638
2639static int wp_page_shared(struct vm_fault *vmf)
2640 __releases(vmf->ptl)
2641{
2642 struct vm_area_struct *vma = vmf->vma;
2643
2644 get_page(vmf->page);
2645
2646 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2647 int tmp;
2648
2649 pte_unmap_unlock(vmf->pte, vmf->ptl);
2650 tmp = do_page_mkwrite(vmf);
2651 if (unlikely(!tmp || (tmp &
2652 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2653 put_page(vmf->page);
2654 return tmp;
2655 }
2656 tmp = finish_mkwrite_fault(vmf);
2657 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2658 unlock_page(vmf->page);
2659 put_page(vmf->page);
2660 return tmp;
2661 }
2662 } else {
2663 wp_page_reuse(vmf);
2664 lock_page(vmf->page);
2665 }
2666 fault_dirty_shared_page(vma, vmf->page);
2667 put_page(vmf->page);
2668
2669 return VM_FAULT_WRITE;
2670}
2671
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
2690static int do_wp_page(struct vm_fault *vmf)
2691 __releases(vmf->ptl)
2692{
2693 struct vm_area_struct *vma = vmf->vma;
2694
2695 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2696 if (!vmf->page) {
2697
2698
2699
2700
2701
2702
2703
2704 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2705 (VM_WRITE|VM_SHARED))
2706 return wp_pfn_shared(vmf);
2707
2708 pte_unmap_unlock(vmf->pte, vmf->ptl);
2709 return wp_page_copy(vmf);
2710 }
2711
2712
2713
2714
2715
2716 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2717 int total_map_swapcount;
2718 if (!trylock_page(vmf->page)) {
2719 get_page(vmf->page);
2720 pte_unmap_unlock(vmf->pte, vmf->ptl);
2721 lock_page(vmf->page);
2722 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2723 vmf->address, &vmf->ptl);
2724 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2725 unlock_page(vmf->page);
2726 pte_unmap_unlock(vmf->pte, vmf->ptl);
2727 put_page(vmf->page);
2728 return 0;
2729 }
2730 put_page(vmf->page);
2731 }
2732 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2733 if (total_map_swapcount == 1) {
2734
2735
2736
2737
2738
2739
2740
2741 page_move_anon_rmap(vmf->page, vma);
2742 }
2743 unlock_page(vmf->page);
2744 wp_page_reuse(vmf);
2745 return VM_FAULT_WRITE;
2746 }
2747 unlock_page(vmf->page);
2748 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2749 (VM_WRITE|VM_SHARED))) {
2750 return wp_page_shared(vmf);
2751 }
2752
2753
2754
2755
2756 get_page(vmf->page);
2757
2758 pte_unmap_unlock(vmf->pte, vmf->ptl);
2759 return wp_page_copy(vmf);
2760}
2761
2762static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2763 unsigned long start_addr, unsigned long end_addr,
2764 struct zap_details *details)
2765{
2766 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2767}
2768
2769static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2770 struct zap_details *details)
2771{
2772 struct vm_area_struct *vma;
2773 pgoff_t vba, vea, zba, zea;
2774
2775 vma_interval_tree_foreach(vma, root,
2776 details->first_index, details->last_index) {
2777
2778 vba = vma->vm_pgoff;
2779 vea = vba + vma_pages(vma) - 1;
2780 zba = details->first_index;
2781 if (zba < vba)
2782 zba = vba;
2783 zea = details->last_index;
2784 if (zea > vea)
2785 zea = vea;
2786
2787 unmap_mapping_range_vma(vma,
2788 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2789 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2790 details);
2791 }
2792}
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
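/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 * @mapping: the address space containing the mmaps to be unmapped.
 * @holebegin: byte in first page to unmap; rounded down to a page boundary.
 * @holelen: size of the prospective hole in bytes; rounded up to a page
 * boundary.  A holelen of zero unmaps to the end of the file.
 * @even_cows: non-zero to also unmap private COWed pages (as on truncate);
 * zero to leave private copies in place (as when invalidating pagecache).
 */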
2811void unmap_mapping_range(struct address_space *mapping,
2812 loff_t const holebegin, loff_t const holelen, int even_cows)
2813{
2814 struct zap_details details = { };
2815 pgoff_t hba = holebegin >> PAGE_SHIFT;
2816 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2817
2818
2819 if (sizeof(holelen) > sizeof(hlen)) {
2820 long long holeend =
2821 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2822 if (holeend & ~(long long)ULONG_MAX)
2823 hlen = ULONG_MAX - hba + 1;
2824 }
2825
2826 details.check_mapping = even_cows ? NULL : mapping;
2827 details.first_index = hba;
2828 details.last_index = hba + hlen - 1;
2829 if (details.last_index < details.first_index)
2830 details.last_index = ULONG_MAX;
2831
2832 i_mmap_lock_write(mapping);
2833 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2834 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2835 i_mmap_unlock_write(mapping);
2836}
2837EXPORT_SYMBOL(unmap_mapping_range);
2838
2839
2840
2841
2842
2843
2844
2845
2846
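/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults), and pte mapped but not yet locked.  We return with the
 * pte unmapped and unlocked.
 *
 * The mmap_sem may have been released on return, depending on the fault
 * flags and the return value (see lock_page_or_retry()).
 */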
2847int do_swap_page(struct vm_fault *vmf)
2848{
2849 struct vm_area_struct *vma = vmf->vma;
2850 struct page *page = NULL, *swapcache = NULL;
2851 struct mem_cgroup *memcg;
2852 struct vma_swap_readahead swap_ra;
2853 swp_entry_t entry;
2854 pte_t pte;
2855 int locked;
2856 int exclusive = 0;
2857 int ret = 0;
2858 bool vma_readahead = swap_use_vma_readahead();
2859
2860 if (vma_readahead) {
2861 page = swap_readahead_detect(vmf, &swap_ra);
2862 swapcache = page;
2863 }
2864
2865 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
2866 if (page)
2867 put_page(page);
2868 goto out;
2869 }
2870
2871 entry = pte_to_swp_entry(vmf->orig_pte);
2872 if (unlikely(non_swap_entry(entry))) {
2873 if (is_migration_entry(entry)) {
2874 migration_entry_wait(vma->vm_mm, vmf->pmd,
2875 vmf->address);
2876 } else if (is_device_private_entry(entry)) {
2877
2878
2879
2880
2881
2882 ret = device_private_entry_fault(vma, vmf->address, entry,
2883 vmf->flags, vmf->pmd);
2884 } else if (is_hwpoison_entry(entry)) {
2885 ret = VM_FAULT_HWPOISON;
2886 } else {
2887 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2888 ret = VM_FAULT_SIGBUS;
2889 }
2890 goto out;
2891 }
2892
2893
2894 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2895 if (!page) {
2896 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2897 vmf->address);
2898 swapcache = page;
2899 }
2900
2901 if (!page) {
2902 struct swap_info_struct *si = swp_swap_info(entry);
2903
2904 if (si->flags & SWP_SYNCHRONOUS_IO &&
2905 __swap_count(si, entry) == 1) {
2906
2907 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2908 if (page) {
2909 __SetPageLocked(page);
2910 __SetPageSwapBacked(page);
2911 set_page_private(page, entry.val);
2912 lru_cache_add_anon(page);
2913 swap_readpage(page, true);
2914 }
2915 } else {
2916 if (vma_readahead)
2917 page = do_swap_page_readahead(entry,
2918 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2919 else
2920 page = swapin_readahead(entry,
2921 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2922 swapcache = page;
2923 }
2924
2925 if (!page) {
2926
2927
2928
2929
2930 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2931 vmf->address, &vmf->ptl);
2932 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2933 ret = VM_FAULT_OOM;
2934 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2935 goto unlock;
2936 }
2937
2938
2939 ret = VM_FAULT_MAJOR;
2940 count_vm_event(PGMAJFAULT);
2941 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2942 } else if (PageHWPoison(page)) {
2943
2944
2945
2946
2947 ret = VM_FAULT_HWPOISON;
2948 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2949 swapcache = page;
2950 goto out_release;
2951 }
2952
2953 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2954
2955 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2956 if (!locked) {
2957 ret |= VM_FAULT_RETRY;
2958 goto out_release;
2959 }
2960
2961
2962
2963
2964
2965
2966
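	/*
	 * Make sure the swap entry still belongs to this page: swapoff or
	 * try_to_free_swap() could have released the swapcache while the page
	 * was unlocked.  The page pin and the pte_same() check below are not
	 * enough to exclude that, so back out if the association changed.
	 */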
2967 if (unlikely((!PageSwapCache(page) ||
2968 page_private(page) != entry.val)) && swapcache)
2969 goto out_page;
2970
2971 page = ksm_might_need_to_copy(page, vma, vmf->address);
2972 if (unlikely(!page)) {
2973 ret = VM_FAULT_OOM;
2974 page = swapcache;
2975 goto out_page;
2976 }
2977
2978 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
2979 &memcg, false)) {
2980 ret = VM_FAULT_OOM;
2981 goto out_page;
2982 }
2983
2984
2985
2986
2987 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2988 &vmf->ptl);
2989 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2990 goto out_nomap;
2991
2992 if (unlikely(!PageUptodate(page))) {
2993 ret = VM_FAULT_SIGBUS;
2994 goto out_nomap;
2995 }
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3008 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3009 pte = mk_pte(page, vma->vm_page_prot);
3010 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3011 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3012 vmf->flags &= ~FAULT_FLAG_WRITE;
3013 ret |= VM_FAULT_WRITE;
3014 exclusive = RMAP_EXCLUSIVE;
3015 }
3016 flush_icache_page(vma, page);
3017 if (pte_swp_soft_dirty(vmf->orig_pte))
3018 pte = pte_mksoft_dirty(pte);
3019 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3020 vmf->orig_pte = pte;
3021
3022
3023 if (unlikely(page != swapcache && swapcache)) {
3024 page_add_new_anon_rmap(page, vma, vmf->address, false);
3025 mem_cgroup_commit_charge(page, memcg, false, false);
3026 lru_cache_add_active_or_unevictable(page, vma);
3027 } else {
3028 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3029 mem_cgroup_commit_charge(page, memcg, true, false);
3030 activate_page(page);
3031 }
3032
3033 swap_free(entry);
3034 if (mem_cgroup_swap_full(page) ||
3035 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3036 try_to_free_swap(page);
3037 unlock_page(page);
3038 if (page != swapcache && swapcache) {
3039
3040
3041
3042
3043
3044
3045
3046
3047 unlock_page(swapcache);
3048 put_page(swapcache);
3049 }
3050
3051 if (vmf->flags & FAULT_FLAG_WRITE) {
3052 ret |= do_wp_page(vmf);
3053 if (ret & VM_FAULT_ERROR)
3054 ret &= VM_FAULT_ERROR;
3055 goto out;
3056 }
3057
3058
3059 update_mmu_cache(vma, vmf->address, vmf->pte);
3060unlock:
3061 pte_unmap_unlock(vmf->pte, vmf->ptl);
3062out:
3063 return ret;
3064out_nomap:
3065 mem_cgroup_cancel_charge(page, memcg, false);
3066 pte_unmap_unlock(vmf->pte, vmf->ptl);
3067out_page:
3068 unlock_page(page);
3069out_release:
3070 put_page(page);
3071 if (page != swapcache && swapcache) {
3072 unlock_page(swapcache);
3073 put_page(swapcache);
3074 }
3075 return ret;
3076}
3077
3078
3079
3080
3081
3082
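/*
 * Fault in an anonymous page that was never present: map the zero page for
 * read faults (when permitted), otherwise allocate a zeroed page, charge it
 * to the memcg and install it.  We enter with non-exclusive mmap_sem and no
 * PTE mapped; the page table lock is taken and dropped internally.
 */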
3083static int do_anonymous_page(struct vm_fault *vmf)
3084{
3085 struct vm_area_struct *vma = vmf->vma;
3086 struct mem_cgroup *memcg;
3087 struct page *page;
3088 int ret = 0;
3089 pte_t entry;
3090
3091
3092 if (vma->vm_flags & VM_SHARED)
3093 return VM_FAULT_SIGBUS;
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
3106 return VM_FAULT_OOM;
3107
3108
3109 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3110 return 0;
3111
3112
3113 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3114 !mm_forbids_zeropage(vma->vm_mm)) {
3115 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3116 vma->vm_page_prot));
3117 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3118 vmf->address, &vmf->ptl);
3119 if (!pte_none(*vmf->pte))
3120 goto unlock;
3121 ret = check_stable_address_space(vma->vm_mm);
3122 if (ret)
3123 goto unlock;
3124
3125 if (userfaultfd_missing(vma)) {
3126 pte_unmap_unlock(vmf->pte, vmf->ptl);
3127 return handle_userfault(vmf, VM_UFFD_MISSING);
3128 }
3129 goto setpte;
3130 }
3131
3132
3133 if (unlikely(anon_vma_prepare(vma)))
3134 goto oom;
3135 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3136 if (!page)
3137 goto oom;
3138
3139 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
3140 goto oom_free_page;
3141
3142
3143
3144
3145
3146
3147 __SetPageUptodate(page);
3148
3149 entry = mk_pte(page, vma->vm_page_prot);
3150 if (vma->vm_flags & VM_WRITE)
3151 entry = pte_mkwrite(pte_mkdirty(entry));
3152
3153 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3154 &vmf->ptl);
3155 if (!pte_none(*vmf->pte))
3156 goto release;
3157
3158 ret = check_stable_address_space(vma->vm_mm);
3159 if (ret)
3160 goto release;
3161
3162
3163 if (userfaultfd_missing(vma)) {
3164 pte_unmap_unlock(vmf->pte, vmf->ptl);
3165 mem_cgroup_cancel_charge(page, memcg, false);
3166 put_page(page);
3167 return handle_userfault(vmf, VM_UFFD_MISSING);
3168 }
3169
3170 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3171 page_add_new_anon_rmap(page, vma, vmf->address, false);
3172 mem_cgroup_commit_charge(page, memcg, false, false);
3173 lru_cache_add_active_or_unevictable(page, vma);
3174setpte:
3175 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3176
3177
3178 update_mmu_cache(vma, vmf->address, vmf->pte);
3179unlock:
3180 pte_unmap_unlock(vmf->pte, vmf->ptl);
3181 return ret;
3182release:
3183 mem_cgroup_cancel_charge(page, memcg, false);
3184 put_page(page);
3185 goto unlock;
3186oom_free_page:
3187 put_page(page);
3188oom:
3189 return VM_FAULT_OOM;
3190}
3191
3192
3193
3194
3195
3196
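/*
 * Call the vma's ->fault() handler and sanity-check the result: pass
 * error/retry results straight back, reject hwpoisoned pages, and make sure
 * the returned page is locked before the caller goes on to map it.
 */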
3197static int __do_fault(struct vm_fault *vmf)
3198{
3199 struct vm_area_struct *vma = vmf->vma;
3200 int ret;
3201
3202 ret = vma->vm_ops->fault(vmf);
3203 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3204 VM_FAULT_DONE_COW)))
3205 return ret;
3206
3207 if (unlikely(PageHWPoison(vmf->page))) {
3208 if (ret & VM_FAULT_LOCKED)
3209 unlock_page(vmf->page);
3210 put_page(vmf->page);
3211 vmf->page = NULL;
3212 return VM_FAULT_HWPOISON;
3213 }
3214
3215 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3216 lock_page(vmf->page);
3217 else
3218 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3219
3220 return ret;
3221}
3222
3223
3224
3225
3226
3227
3228
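/*
 * A pmd that is devmap-backed or in an unstable state (being split, or just
 * cleared) cannot be used to walk down to a pte table.  Callers treat a true
 * return as "retry the fault" rather than as an error.
 */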
3229static int pmd_devmap_trans_unstable(pmd_t *pmd)
3230{
3231 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3232}
3233
3234static int pte_alloc_one_map(struct vm_fault *vmf)
3235{
3236 struct vm_area_struct *vma = vmf->vma;
3237
3238 if (!pmd_none(*vmf->pmd))
3239 goto map_pte;
3240 if (vmf->prealloc_pte) {
3241 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3242 if (unlikely(!pmd_none(*vmf->pmd))) {
3243 spin_unlock(vmf->ptl);
3244 goto map_pte;
3245 }
3246
3247 mm_inc_nr_ptes(vma->vm_mm);
3248 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3249 spin_unlock(vmf->ptl);
3250 vmf->prealloc_pte = NULL;
3251 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
3252 return VM_FAULT_OOM;
3253 }
3254map_pte:
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266 if (pmd_devmap_trans_unstable(vmf->pmd))
3267 return VM_FAULT_NOPAGE;
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3279 &vmf->ptl);
3280 return 0;
3281}
3282
3283#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3284
3285#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3286static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3287 unsigned long haddr)
3288{
3289 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3290 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3291 return false;
3292 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3293 return false;
3294 return true;
3295}
3296
3297static void deposit_prealloc_pte(struct vm_fault *vmf)
3298{
3299 struct vm_area_struct *vma = vmf->vma;
3300
3301 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3302
3303
3304
3305
3306 mm_inc_nr_ptes(vma->vm_mm);
3307 vmf->prealloc_pte = NULL;
3308}
3309
3310static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3311{
3312 struct vm_area_struct *vma = vmf->vma;
3313 bool write = vmf->flags & FAULT_FLAG_WRITE;
3314 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3315 pmd_t entry;
3316 int i, ret;
3317
3318 if (!transhuge_vma_suitable(vma, haddr))
3319 return VM_FAULT_FALLBACK;
3320
3321 ret = VM_FAULT_FALLBACK;
3322 page = compound_head(page);
3323
3324
3325
3326
3327
3328 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3329 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
3330 if (!vmf->prealloc_pte)
3331 return VM_FAULT_OOM;
3332 smp_wmb();
3333 }
3334
3335 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3336 if (unlikely(!pmd_none(*vmf->pmd)))
3337 goto out;
3338
3339 for (i = 0; i < HPAGE_PMD_NR; i++)
3340 flush_icache_page(vma, page + i);
3341
3342 entry = mk_huge_pmd(page, vma->vm_page_prot);
3343 if (write)
3344 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3345
3346 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
3347 page_add_file_rmap(page, true);
3348
3349
3350
3351 if (arch_needs_pgtable_deposit())
3352 deposit_prealloc_pte(vmf);
3353
3354 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3355
3356 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3357
3358
3359 ret = 0;
3360 count_vm_event(THP_FILE_MAPPED);
3361out:
3362 spin_unlock(vmf->ptl);
3363 return ret;
3364}
3365#else
3366static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3367{
3368 BUILD_BUG();
3369 return 0;
3370}
3371#endif
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
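/**
 * alloc_set_pte - setup a new PTE entry for the given page and add reverse
 * page mapping.  If needed, the function allocates a page table first.
 * @vmf: fault environment
 * @memcg: memcg to charge the page to (used only for private COW mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Used by the fault handlers themselves and by implementations of
 * vm_ops->map_pages.
 */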
3387int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3388 struct page *page)
3389{
3390 struct vm_area_struct *vma = vmf->vma;
3391 bool write = vmf->flags & FAULT_FLAG_WRITE;
3392 pte_t entry;
3393 int ret;
3394
3395 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3396 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3397
3398 VM_BUG_ON_PAGE(memcg, page);
3399
3400 ret = do_set_pmd(vmf, page);
3401 if (ret != VM_FAULT_FALLBACK)
3402 return ret;
3403 }
3404
3405 if (!vmf->pte) {
3406 ret = pte_alloc_one_map(vmf);
3407 if (ret)
3408 return ret;
3409 }
3410
3411
3412 if (unlikely(!pte_none(*vmf->pte)))
3413 return VM_FAULT_NOPAGE;
3414
3415 flush_icache_page(vma, page);
3416 entry = mk_pte(page, vma->vm_page_prot);
3417 if (write)
3418 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3419
3420 if (write && !(vma->vm_flags & VM_SHARED)) {
3421 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3422 page_add_new_anon_rmap(page, vma, vmf->address, false);
3423 mem_cgroup_commit_charge(page, memcg, false, false);
3424 lru_cache_add_active_or_unevictable(page, vma);
3425 } else {
3426 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3427 page_add_file_rmap(page, false);
3428 }
3429 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3430
3431
3432 update_mmu_cache(vma, vmf->address, vmf->pte);
3433
3434 return 0;
3435}
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
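/**
 * finish_fault - finish a page fault once we have prepared the page to fault
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is ready.  It handles locking of PTEs, installs the new
 * PTE entry and charges the page to the memcg for COW faults.
 *
 * The function expects the page to be locked, and on success it consumes the
 * reference of the page being mapped (for the PTE which maps it).
 */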
3452int finish_fault(struct vm_fault *vmf)
3453{
3454 struct page *page;
3455 int ret = 0;
3456
3457
3458 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3459 !(vmf->vma->vm_flags & VM_SHARED))
3460 page = vmf->cow_page;
3461 else
3462 page = vmf->page;
3463
3464
3465
3466
3467
3468 if (!(vmf->vma->vm_flags & VM_SHARED))
3469 ret = check_stable_address_space(vmf->vma->vm_mm);
3470 if (!ret)
3471 ret = alloc_set_pte(vmf, vmf->memcg, page);
3472 if (vmf->pte)
3473 pte_unmap_unlock(vmf->pte, vmf->ptl);
3474 return ret;
3475}
3476
3477static unsigned long fault_around_bytes __read_mostly =
3478 rounddown_pow_of_two(65536);
3479
3480#ifdef CONFIG_DEBUG_FS
3481static int fault_around_bytes_get(void *data, u64 *val)
3482{
3483 *val = fault_around_bytes;
3484 return 0;
3485}
3486
3487
3488
3489
3490
3491
3492static int fault_around_bytes_set(void *data, u64 val)
3493{
3494 if (val / PAGE_SIZE > PTRS_PER_PTE)
3495 return -EINVAL;
3496 if (val > PAGE_SIZE)
3497 fault_around_bytes = rounddown_pow_of_two(val);
3498 else
3499 fault_around_bytes = PAGE_SIZE;
3500 return 0;
3501}
3502DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3503 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3504
3505static int __init fault_around_debugfs(void)
3506{
3507 void *ret;
3508
3509 ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3510 &fault_around_bytes_fops);
3511 if (!ret)
3512 pr_warn("Failed to create fault_around_bytes in debugfs");
3513 return 0;
3514}
3515late_initcall(fault_around_debugfs);
3516#endif
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
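/*
 * do_fault_around() tries to map a few pages around the fault address in the
 * hope that they will be needed soon, which lowers the number of faults to
 * handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips pages that are
 * not yet in the page cache.  The mapped range never crosses a page table or
 * VMA boundary; its size is controlled by fault_around_bytes (a power of two,
 * at most one page table's worth of pages).
 *
 * Returns non-zero when the fault at vmf->address itself was resolved (or
 * must be retried); zero when the caller still has to call ->fault().
 */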
3541static int do_fault_around(struct vm_fault *vmf)
3542{
3543 unsigned long address = vmf->address, nr_pages, mask;
3544 pgoff_t start_pgoff = vmf->pgoff;
3545 pgoff_t end_pgoff;
3546 int off, ret = 0;
3547
3548 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3549 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3550
3551 vmf->address = max(address & mask, vmf->vma->vm_start);
3552 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3553 start_pgoff -= off;
3554
3555
3556
3557
3558
3559 end_pgoff = start_pgoff -
3560 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3561 PTRS_PER_PTE - 1;
3562 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3563 start_pgoff + nr_pages - 1);
3564
3565 if (pmd_none(*vmf->pmd)) {
3566 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3567 vmf->address);
3568 if (!vmf->prealloc_pte)
3569 goto out;
3570 smp_wmb();
3571 }
3572
3573 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3574
3575
3576 if (pmd_trans_huge(*vmf->pmd)) {
3577 ret = VM_FAULT_NOPAGE;
3578 goto out;
3579 }
3580
3581
3582 if (!vmf->pte)
3583 goto out;
3584
3585
3586 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3587 if (!pte_none(*vmf->pte))
3588 ret = VM_FAULT_NOPAGE;
3589 pte_unmap_unlock(vmf->pte, vmf->ptl);
3590out:
3591 vmf->address = address;
3592 vmf->pte = NULL;
3593 return ret;
3594}
3595
3596static int do_read_fault(struct vm_fault *vmf)
3597{
3598 struct vm_area_struct *vma = vmf->vma;
3599 int ret = 0;
3600
3601
3602
3603
3604
3605
3606 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3607 ret = do_fault_around(vmf);
3608 if (ret)
3609 return ret;
3610 }
3611
3612 ret = __do_fault(vmf);
3613 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3614 return ret;
3615
3616 ret |= finish_fault(vmf);
3617 unlock_page(vmf->page);
3618 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3619 put_page(vmf->page);
3620 return ret;
3621}
3622
3623static int do_cow_fault(struct vm_fault *vmf)
3624{
3625 struct vm_area_struct *vma = vmf->vma;
3626 int ret;
3627
3628 if (unlikely(anon_vma_prepare(vma)))
3629 return VM_FAULT_OOM;
3630
3631 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3632 if (!vmf->cow_page)
3633 return VM_FAULT_OOM;
3634
3635 if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3636 &vmf->memcg, false)) {
3637 put_page(vmf->cow_page);
3638 return VM_FAULT_OOM;
3639 }
3640
3641 ret = __do_fault(vmf);
3642 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3643 goto uncharge_out;
3644 if (ret & VM_FAULT_DONE_COW)
3645 return ret;
3646
3647 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3648 __SetPageUptodate(vmf->cow_page);
3649
3650 ret |= finish_fault(vmf);
3651 unlock_page(vmf->page);
3652 put_page(vmf->page);
3653 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3654 goto uncharge_out;
3655 return ret;
3656uncharge_out:
3657 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3658 put_page(vmf->cow_page);
3659 return ret;
3660}
3661
3662static int do_shared_fault(struct vm_fault *vmf)
3663{
3664 struct vm_area_struct *vma = vmf->vma;
3665 int ret, tmp;
3666
3667 ret = __do_fault(vmf);
3668 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3669 return ret;
3670
3671
3672
3673
3674
3675 if (vma->vm_ops->page_mkwrite) {
3676 unlock_page(vmf->page);
3677 tmp = do_page_mkwrite(vmf);
3678 if (unlikely(!tmp ||
3679 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3680 put_page(vmf->page);
3681 return tmp;
3682 }
3683 }
3684
3685 ret |= finish_fault(vmf);
3686 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3687 VM_FAULT_RETRY))) {
3688 unlock_page(vmf->page);
3689 put_page(vmf->page);
3690 return ret;
3691 }
3692
3693 fault_dirty_shared_page(vma, vmf->page);
3694 return ret;
3695}
3696
3697
3698
3699
3700
3701
3702
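/*
 * Dispatch a fault on a non-anonymous VMA: read faults, private COW faults
 * and shared write faults each get their own handler.  We enter with
 * non-exclusive mmap_sem; it may have been released on return, depending on
 * the fault flags and the return value.
 */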
3703static int do_fault(struct vm_fault *vmf)
3704{
3705 struct vm_area_struct *vma = vmf->vma;
3706 int ret;
3707
3708
3709 if (!vma->vm_ops->fault)
3710 ret = VM_FAULT_SIGBUS;
3711 else if (!(vmf->flags & FAULT_FLAG_WRITE))
3712 ret = do_read_fault(vmf);
3713 else if (!(vma->vm_flags & VM_SHARED))
3714 ret = do_cow_fault(vmf);
3715 else
3716 ret = do_shared_fault(vmf);
3717
3718
3719 if (vmf->prealloc_pte) {
3720 pte_free(vma->vm_mm, vmf->prealloc_pte);
3721 vmf->prealloc_pte = NULL;
3722 }
3723 return ret;
3724}
3725
3726static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3727 unsigned long addr, int page_nid,
3728 int *flags)
3729{
3730 get_page(page);
3731
3732 count_vm_numa_event(NUMA_HINT_FAULTS);
3733 if (page_nid == numa_node_id()) {
3734 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3735 *flags |= TNF_FAULT_LOCAL;
3736 }
3737
3738 return mpol_misplaced(page, vma, addr);
3739}
3740
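/*
 * Handle a NUMA hinting fault on a pte that was made inaccessible by the
 * NUMA balancer: restore the pte, record which node the access came from,
 * and migrate the page to the faulting node if the memory policy considers
 * it misplaced.
 */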
3741static int do_numa_page(struct vm_fault *vmf)
3742{
3743 struct vm_area_struct *vma = vmf->vma;
3744 struct page *page = NULL;
3745 int page_nid = -1;
3746 int last_cpupid;
3747 int target_nid;
3748 bool migrated = false;
3749 pte_t pte;
3750 bool was_writable = pte_savedwrite(vmf->orig_pte);
3751 int flags = 0;
3752
3753
3754
3755
3756
3757
3758 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3759 spin_lock(vmf->ptl);
3760 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3761 pte_unmap_unlock(vmf->pte, vmf->ptl);
3762 goto out;
3763 }
3764
3765
3766
3767
3768
3769 pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
3770 pte = pte_modify(pte, vma->vm_page_prot);
3771 pte = pte_mkyoung(pte);
3772 if (was_writable)
3773 pte = pte_mkwrite(pte);
3774 ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
3775 update_mmu_cache(vma, vmf->address, vmf->pte);
3776
3777 page = vm_normal_page(vma, vmf->address, pte);
3778 if (!page) {
3779 pte_unmap_unlock(vmf->pte, vmf->ptl);
3780 return 0;
3781 }
3782
3783
3784 if (PageCompound(page)) {
3785 pte_unmap_unlock(vmf->pte, vmf->ptl);
3786 return 0;
3787 }
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797 if (!pte_write(pte))
3798 flags |= TNF_NO_GROUP;
3799
3800
3801
3802
3803
3804 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3805 flags |= TNF_SHARED;
3806
3807 last_cpupid = page_cpupid_last(page);
3808 page_nid = page_to_nid(page);
3809 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3810 &flags);
3811 pte_unmap_unlock(vmf->pte, vmf->ptl);
3812 if (target_nid == -1) {
3813 put_page(page);
3814 goto out;
3815 }
3816
3817
3818 migrated = migrate_misplaced_page(page, vma, target_nid);
3819 if (migrated) {
3820 page_nid = target_nid;
3821 flags |= TNF_MIGRATED;
3822 } else
3823 flags |= TNF_MIGRATE_FAIL;
3824
3825out:
3826 if (page_nid != -1)
3827 task_numa_fault(last_cpupid, page_nid, 1, flags);
3828 return 0;
3829}
3830
3831static inline int create_huge_pmd(struct vm_fault *vmf)
3832{
3833 if (vma_is_anonymous(vmf->vma))
3834 return do_huge_pmd_anonymous_page(vmf);
3835 if (vmf->vma->vm_ops->huge_fault)
3836 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3837 return VM_FAULT_FALLBACK;
3838}
3839
3840
3841static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3842{
3843 if (vma_is_anonymous(vmf->vma))
3844 return do_huge_pmd_wp_page(vmf, orig_pmd);
3845 if (vmf->vma->vm_ops->huge_fault)
3846 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3847
3848
3849 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3850 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3851
3852 return VM_FAULT_FALLBACK;
3853}
3854
3855static inline bool vma_is_accessible(struct vm_area_struct *vma)
3856{
3857 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3858}
3859
3860static int create_huge_pud(struct vm_fault *vmf)
3861{
3862#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3863
3864 if (vma_is_anonymous(vmf->vma))
3865 return VM_FAULT_FALLBACK;
3866 if (vmf->vma->vm_ops->huge_fault)
3867 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3868#endif
3869 return VM_FAULT_FALLBACK;
3870}
3871
3872static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3873{
3874#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3875
3876 if (vma_is_anonymous(vmf->vma))
3877 return VM_FAULT_FALLBACK;
3878 if (vmf->vma->vm_ops->huge_fault)
3879 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3880#endif
3881 return VM_FAULT_FALLBACK;
3882}
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
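/*
 * Handle a fault at the PTE level: not-present anonymous and file pages, swap
 * entries, NUMA hinting faults, and write or access faults on present PTEs.
 * This is also where pages are marked dirty and/or accessed for architectures
 * that do not do it in hardware, and where update_mmu_cache() is called for
 * architectures with external MMU caches.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults); it may have been released on return, depending on the
 * fault flags and the return value.
 */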
3899static int handle_pte_fault(struct vm_fault *vmf)
3900{
3901 pte_t entry;
3902
3903 if (unlikely(pmd_none(*vmf->pmd))) {
3904
3905
3906
3907
3908
3909
3910 vmf->pte = NULL;
3911 } else {
3912
3913 if (pmd_devmap_trans_unstable(vmf->pmd))
3914 return 0;
3915
3916
3917
3918
3919
3920
3921 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3922 vmf->orig_pte = *vmf->pte;
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932 barrier();
3933 if (pte_none(vmf->orig_pte)) {
3934 pte_unmap(vmf->pte);
3935 vmf->pte = NULL;
3936 }
3937 }
3938
3939 if (!vmf->pte) {
3940 if (vma_is_anonymous(vmf->vma))
3941 return do_anonymous_page(vmf);
3942 else
3943 return do_fault(vmf);
3944 }
3945
3946 if (!pte_present(vmf->orig_pte))
3947 return do_swap_page(vmf);
3948
3949 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3950 return do_numa_page(vmf);
3951
3952 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3953 spin_lock(vmf->ptl);
3954 entry = vmf->orig_pte;
3955 if (unlikely(!pte_same(*vmf->pte, entry)))
3956 goto unlock;
3957 if (vmf->flags & FAULT_FLAG_WRITE) {
3958 if (!pte_write(entry))
3959 return do_wp_page(vmf);
3960 entry = pte_mkdirty(entry);
3961 }
3962 entry = pte_mkyoung(entry);
3963 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3964 vmf->flags & FAULT_FLAG_WRITE)) {
3965 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3966 } else {
3967
3968
3969
3970
3971
3972
3973 if (vmf->flags & FAULT_FLAG_WRITE)
3974 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3975 }
3976unlock:
3977 pte_unmap_unlock(vmf->pte, vmf->ptl);
3978 return 0;
3979}
3980
3981
3982
3983
3984
3985
3986
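/*
 * Walk and, where needed, allocate the page table hierarchy (p4d, pud, pmd)
 * for the faulting address, handling transparent huge PUD/PMD faults along
 * the way, then hand the fault down to handle_pte_fault().  Called with the
 * mm semaphore held.
 */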
3987static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3988 unsigned int flags)
3989{
3990 struct vm_fault vmf = {
3991 .vma = vma,
3992 .address = address & PAGE_MASK,
3993 .flags = flags,
3994 .pgoff = linear_page_index(vma, address),
3995 .gfp_mask = __get_fault_gfp_mask(vma),
3996 };
3997 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3998 struct mm_struct *mm = vma->vm_mm;
3999 pgd_t *pgd;
4000 p4d_t *p4d;
4001 int ret;
4002
4003 pgd = pgd_offset(mm, address);
4004 p4d = p4d_alloc(mm, pgd, address);
4005 if (!p4d)
4006 return VM_FAULT_OOM;
4007
4008 vmf.pud = pud_alloc(mm, p4d, address);
4009 if (!vmf.pud)
4010 return VM_FAULT_OOM;
4011 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
4012 ret = create_huge_pud(&vmf);
4013 if (!(ret & VM_FAULT_FALLBACK))
4014 return ret;
4015 } else {
4016 pud_t orig_pud = *vmf.pud;
4017
4018 barrier();
4019 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
4020
4021
4022
4023 if (dirty && !pud_write(orig_pud)) {
4024 ret = wp_huge_pud(&vmf, orig_pud);
4025 if (!(ret & VM_FAULT_FALLBACK))
4026 return ret;
4027 } else {
4028 huge_pud_set_accessed(&vmf, orig_pud);
4029 return 0;
4030 }
4031 }
4032 }
4033
4034 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4035 if (!vmf.pmd)
4036 return VM_FAULT_OOM;
4037 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
4038 ret = create_huge_pmd(&vmf);
4039 if (!(ret & VM_FAULT_FALLBACK))
4040 return ret;
4041 } else {
4042 pmd_t orig_pmd = *vmf.pmd;
4043
4044 barrier();
4045 if (unlikely(is_swap_pmd(orig_pmd))) {
4046 VM_BUG_ON(thp_migration_supported() &&
4047 !is_pmd_migration_entry(orig_pmd));
4048 if (is_pmd_migration_entry(orig_pmd))
4049 pmd_migration_entry_wait(mm, vmf.pmd);
4050 return 0;
4051 }
4052 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4053 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4054 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4055
4056 if (dirty && !pmd_write(orig_pmd)) {
4057 ret = wp_huge_pmd(&vmf, orig_pmd);
4058 if (!(ret & VM_FAULT_FALLBACK))
4059 return ret;
4060 } else {
4061 huge_pmd_set_accessed(&vmf, orig_pmd);
4062 return 0;
4063 }
4064 }
4065 }
4066
4067 return handle_pte_fault(&vmf);
4068}
4069
4070
4071
4072
4073
4074
4075
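/*
 * Arch-independent entry point for page faults: updates fault counters,
 * checks access permissions, enables memcg OOM handling for user-mode faults,
 * and dispatches to hugetlb_fault() or __handle_mm_fault().
 *
 * Called with the mm semaphore held; it may have been released on return,
 * depending on the fault flags and the return value.
 */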
4076int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4077 unsigned int flags)
4078{
4079 int ret;
4080
4081 __set_current_state(TASK_RUNNING);
4082
4083 count_vm_event(PGFAULT);
4084 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4085
4086
4087 check_sync_rss_stat(current);
4088
4089 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4090 flags & FAULT_FLAG_INSTRUCTION,
4091 flags & FAULT_FLAG_REMOTE))
4092 return VM_FAULT_SIGSEGV;
4093
4094
4095
4096
4097
4098 if (flags & FAULT_FLAG_USER)
4099 mem_cgroup_oom_enable();
4100
4101 if (unlikely(is_vm_hugetlb_page(vma)))
4102 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4103 else
4104 ret = __handle_mm_fault(vma, address, flags);
4105
4106 if (flags & FAULT_FLAG_USER) {
4107 mem_cgroup_oom_disable();
4108
4109
4110
4111
4112
4113
4114 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4115 mem_cgroup_oom_synchronize(false);
4116 }
4117
4118 return ret;
4119}
4120EXPORT_SYMBOL_GPL(handle_mm_fault);
4121
4122#ifndef __PAGETABLE_P4D_FOLDED
4123
4124
4125
4126
4127int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4128{
4129 p4d_t *new = p4d_alloc_one(mm, address);
4130 if (!new)
4131 return -ENOMEM;
4132
4133 smp_wmb();
4134
4135 spin_lock(&mm->page_table_lock);
4136 if (pgd_present(*pgd))
4137 p4d_free(mm, new);
4138 else
4139 pgd_populate(mm, pgd, new);
4140 spin_unlock(&mm->page_table_lock);
4141 return 0;
4142}
4143#endif
4144
4145#ifndef __PAGETABLE_PUD_FOLDED
4146
4147
4148
4149
4150int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4151{
4152 pud_t *new = pud_alloc_one(mm, address);
4153 if (!new)
4154 return -ENOMEM;
4155
4156 smp_wmb();
4157
4158 spin_lock(&mm->page_table_lock);
4159#ifndef __ARCH_HAS_5LEVEL_HACK
4160 if (!p4d_present(*p4d)) {
4161 mm_inc_nr_puds(mm);
4162 p4d_populate(mm, p4d, new);
4163 } else
4164 pud_free(mm, new);
4165#else
4166 if (!pgd_present(*p4d)) {
4167 mm_inc_nr_puds(mm);
4168 pgd_populate(mm, p4d, new);
4169 } else
4170 pud_free(mm, new);
4171#endif
4172 spin_unlock(&mm->page_table_lock);
4173 return 0;
4174}
4175#endif
4176
4177#ifndef __PAGETABLE_PMD_FOLDED
4178
4179
4180
4181
4182int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4183{
4184 spinlock_t *ptl;
4185 pmd_t *new = pmd_alloc_one(mm, address);
4186 if (!new)
4187 return -ENOMEM;
4188
4189 smp_wmb();
4190
4191 ptl = pud_lock(mm, pud);
4192#ifndef __ARCH_HAS_4LEVEL_HACK
4193 if (!pud_present(*pud)) {
4194 mm_inc_nr_pmds(mm);
4195 pud_populate(mm, pud, new);
4196 } else
4197 pmd_free(mm, new);
4198#else
4199 if (!pgd_present(*pud)) {
4200 mm_inc_nr_pmds(mm);
4201 pgd_populate(mm, pud, new);
4202 } else
4203 pmd_free(mm, new);
4204#endif
4205 spin_unlock(ptl);
4206 return 0;
4207}
4208#endif
4209
4210static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4211 unsigned long *start, unsigned long *end,
4212 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4213{
4214 pgd_t *pgd;
4215 p4d_t *p4d;
4216 pud_t *pud;
4217 pmd_t *pmd;
4218 pte_t *ptep;
4219
4220 pgd = pgd_offset(mm, address);
4221 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4222 goto out;
4223
4224 p4d = p4d_offset(pgd, address);
4225 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4226 goto out;
4227
4228 pud = pud_offset(p4d, address);
4229 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4230 goto out;
4231
4232 pmd = pmd_offset(pud, address);
4233 VM_BUG_ON(pmd_trans_huge(*pmd));
4234
4235 if (pmd_huge(*pmd)) {
4236 if (!pmdpp)
4237 goto out;
4238
4239 if (start && end) {
4240 *start = address & PMD_MASK;
4241 *end = *start + PMD_SIZE;
4242 mmu_notifier_invalidate_range_start(mm, *start, *end);
4243 }
4244 *ptlp = pmd_lock(mm, pmd);
4245 if (pmd_huge(*pmd)) {
4246 *pmdpp = pmd;
4247 return 0;
4248 }
4249 spin_unlock(*ptlp);
4250 if (start && end)
4251 mmu_notifier_invalidate_range_end(mm, *start, *end);
4252 }
4253
4254 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4255 goto out;
4256
4257 if (start && end) {
4258 *start = address & PAGE_MASK;
4259 *end = *start + PAGE_SIZE;
4260 mmu_notifier_invalidate_range_start(mm, *start, *end);
4261 }
4262 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4263 if (!pte_present(*ptep))
4264 goto unlock;
4265 *ptepp = ptep;
4266 return 0;
4267unlock:
4268 pte_unmap_unlock(ptep, *ptlp);
4269 if (start && end)
4270 mmu_notifier_invalidate_range_end(mm, *start, *end);
4271out:
4272 return -EINVAL;
4273}
4274
4275static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4276 pte_t **ptepp, spinlock_t **ptlp)
4277{
4278 int res;
4279
4280
4281 (void) __cond_lock(*ptlp,
4282 !(res = __follow_pte_pmd(mm, address, NULL, NULL,
4283 ptepp, NULL, ptlp)));
4284 return res;
4285}
4286
4287int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4288 unsigned long *start, unsigned long *end,
4289 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4290{
4291 int res;
4292
4293
4294 (void) __cond_lock(*ptlp,
4295 !(res = __follow_pte_pmd(mm, address, start, end,
4296 ptepp, pmdpp, ptlp)));
4297 return res;
4298}
4299EXPORT_SYMBOL(follow_pte_pmd);
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
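/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */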
4311int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4312 unsigned long *pfn)
4313{
4314 int ret = -EINVAL;
4315 spinlock_t *ptl;
4316 pte_t *ptep;
4317
4318 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4319 return ret;
4320
4321 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4322 if (ret)
4323 return ret;
4324 *pfn = pte_pfn(*ptep);
4325 pte_unmap_unlock(ptep, ptl);
4326 return 0;
4327}
4328EXPORT_SYMBOL(follow_pfn);
4329
4330#ifdef CONFIG_HAVE_IOREMAP_PROT
4331int follow_phys(struct vm_area_struct *vma,
4332 unsigned long address, unsigned int flags,
4333 unsigned long *prot, resource_size_t *phys)
4334{
4335 int ret = -EINVAL;
4336 pte_t *ptep, pte;
4337 spinlock_t *ptl;
4338
4339 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4340 goto out;
4341
4342 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4343 goto out;
4344 pte = *ptep;
4345
4346 if ((flags & FOLL_WRITE) && !pte_write(pte))
4347 goto unlock;
4348
4349 *prot = pgprot_val(pte_pgprot(pte));
4350 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4351
4352 ret = 0;
4353unlock:
4354 pte_unmap_unlock(ptep, ptl);
4355out:
4356 return ret;
4357}
4358
4359int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4360 void *buf, int len, int write)
4361{
4362 resource_size_t phys_addr;
4363 unsigned long prot = 0;
4364 void __iomem *maddr;
4365 int offset = addr & (PAGE_SIZE-1);
4366
4367 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4368 return -EINVAL;
4369
4370 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4371 if (write)
4372 memcpy_toio(maddr + offset, buf, len);
4373 else
4374 memcpy_fromio(buf, maddr + offset, len);
4375 iounmap(maddr);
4376
4377 return len;
4378}
4379EXPORT_SYMBOL_GPL(generic_access_phys);
4380#endif
4381
4382
4383
4384
4385
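/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */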
4386int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4387 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4388{
4389 struct vm_area_struct *vma;
4390 void *old_buf = buf;
4391 int write = gup_flags & FOLL_WRITE;
4392
4393 down_read(&mm->mmap_sem);
4394
4395 while (len) {
4396 int bytes, ret, offset;
4397 void *maddr;
4398 struct page *page = NULL;
4399
4400 ret = get_user_pages_remote(tsk, mm, addr, 1,
4401 gup_flags, &page, &vma, NULL);
4402 if (ret <= 0) {
4403#ifndef CONFIG_HAVE_IOREMAP_PROT
4404 break;
4405#else
4406
4407
4408
4409
4410 vma = find_vma(mm, addr);
4411 if (!vma || vma->vm_start > addr)
4412 break;
4413 if (vma->vm_ops && vma->vm_ops->access)
4414 ret = vma->vm_ops->access(vma, addr, buf,
4415 len, write);
4416 if (ret <= 0)
4417 break;
4418 bytes = ret;
4419#endif
4420 } else {
4421 bytes = len;
4422 offset = addr & (PAGE_SIZE-1);
4423 if (bytes > PAGE_SIZE-offset)
4424 bytes = PAGE_SIZE-offset;
4425
4426 maddr = kmap(page);
4427 if (write) {
4428 copy_to_user_page(vma, page, addr,
4429 maddr + offset, buf, bytes);
4430 set_page_dirty_lock(page);
4431 } else {
4432 copy_from_user_page(vma, page, addr,
4433 buf, maddr + offset, bytes);
4434 }
4435 kunmap(page);
4436 put_page(page);
4437 }
4438 len -= bytes;
4439 buf += bytes;
4440 addr += bytes;
4441 }
4442 up_read(&mm->mmap_sem);
4443
4444 return buf - old_buf;
4445}
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
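/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */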
4457int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4458 void *buf, int len, unsigned int gup_flags)
4459{
4460 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4461}
4462
4463
4464
4465
4466
4467
4468int access_process_vm(struct task_struct *tsk, unsigned long addr,
4469 void *buf, int len, unsigned int gup_flags)
4470{
4471 struct mm_struct *mm;
4472 int ret;
4473
4474 mm = get_task_mm(tsk);
4475 if (!mm)
4476 return 0;
4477
4478 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4479
4480 mmput(mm);
4481
4482 return ret;
4483}
4484EXPORT_SYMBOL_GPL(access_process_vm);
4485
4486
4487
4488
4489void print_vma_addr(char *prefix, unsigned long ip)
4490{
4491 struct mm_struct *mm = current->mm;
4492 struct vm_area_struct *vma;
4493
4494
4495
4496
4497 if (!down_read_trylock(&mm->mmap_sem))
4498 return;
4499
4500 vma = find_vma(mm, ip);
4501 if (vma && vma->vm_file) {
4502 struct file *f = vma->vm_file;
4503 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4504 if (buf) {
4505 char *p;
4506
4507 p = file_path(f, buf, PAGE_SIZE);
4508 if (IS_ERR(p))
4509 p = "?";
4510 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4511 vma->vm_start,
4512 vma->vm_end - vma->vm_start);
4513 free_page((unsigned long)buf);
4514 }
4515 }
4516 up_read(&mm->mmap_sem);
4517}
4518
4519#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4520void __might_fault(const char *file, int line)
4521{
4522
4523
4524
4525
4526
4527
4528 if (uaccess_kernel())
4529 return;
4530 if (pagefault_disabled())
4531 return;
4532 __might_sleep(file, line, 0);
4533#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4534 if (current->mm)
4535		might_lock_read(&current->mm->mmap_sem);
4536#endif
4537}
4538EXPORT_SYMBOL(__might_fault);
4539#endif
4540
4541#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4542static void clear_gigantic_page(struct page *page,
4543 unsigned long addr,
4544 unsigned int pages_per_huge_page)
4545{
4546 int i;
4547 struct page *p = page;
4548
4549 might_sleep();
4550 for (i = 0; i < pages_per_huge_page;
4551 i++, p = mem_map_next(p, page, i)) {
4552 cond_resched();
4553 clear_user_highpage(p, addr + i * PAGE_SIZE);
4554 }
4555}
4556void clear_huge_page(struct page *page,
4557 unsigned long addr_hint, unsigned int pages_per_huge_page)
4558{
4559 int i, n, base, l;
4560 unsigned long addr = addr_hint &
4561 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4562
4563 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4564 clear_gigantic_page(page, addr, pages_per_huge_page);
4565 return;
4566 }
4567
4568
4569 might_sleep();
4570 n = (addr_hint - addr) / PAGE_SIZE;
4571 if (2 * n <= pages_per_huge_page) {
4572
4573 base = 0;
4574 l = n;
4575
4576 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4577 cond_resched();
4578 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4579 }
4580 } else {
4581
4582 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4583 l = pages_per_huge_page - n;
4584
4585 for (i = 0; i < base; i++) {
4586 cond_resched();
4587 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4588 }
4589 }
4590
4591
4592
4593
4594 for (i = 0; i < l; i++) {
4595 int left_idx = base + i;
4596 int right_idx = base + 2 * l - 1 - i;
4597
4598 cond_resched();
4599 clear_user_highpage(page + left_idx,
4600 addr + left_idx * PAGE_SIZE);
4601 cond_resched();
4602 clear_user_highpage(page + right_idx,
4603 addr + right_idx * PAGE_SIZE);
4604 }
4605}
4606
4607static void copy_user_gigantic_page(struct page *dst, struct page *src,
4608 unsigned long addr,
4609 struct vm_area_struct *vma,
4610 unsigned int pages_per_huge_page)
4611{
4612 int i;
4613 struct page *dst_base = dst;
4614 struct page *src_base = src;
4615
4616 for (i = 0; i < pages_per_huge_page; ) {
4617 cond_resched();
4618 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4619
4620 i++;
4621 dst = mem_map_next(dst, dst_base, i);
4622 src = mem_map_next(src, src_base, i);
4623 }
4624}
4625
4626void copy_user_huge_page(struct page *dst, struct page *src,
4627 unsigned long addr, struct vm_area_struct *vma,
4628 unsigned int pages_per_huge_page)
4629{
4630 int i;
4631
4632 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4633 copy_user_gigantic_page(dst, src, addr, vma,
4634 pages_per_huge_page);
4635 return;
4636 }
4637
4638 might_sleep();
4639 for (i = 0; i < pages_per_huge_page; i++) {
4640 cond_resched();
4641 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4642 }
4643}
4644
4645long copy_huge_page_from_user(struct page *dst_page,
4646 const void __user *usr_src,
4647 unsigned int pages_per_huge_page,
4648 bool allow_pagefault)
4649{
4650 void *src = (void *)usr_src;
4651 void *page_kaddr;
4652 unsigned long i, rc = 0;
4653 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4654
4655 for (i = 0; i < pages_per_huge_page; i++) {
4656 if (allow_pagefault)
4657 page_kaddr = kmap(dst_page + i);
4658 else
4659 page_kaddr = kmap_atomic(dst_page + i);
4660 rc = copy_from_user(page_kaddr,
4661 (const void __user *)(src + i * PAGE_SIZE),
4662 PAGE_SIZE);
4663 if (allow_pagefault)
4664 kunmap(dst_page + i);
4665 else
4666 kunmap_atomic(page_kaddr);
4667
4668 ret_val -= (PAGE_SIZE - rc);
4669 if (rc)
4670 break;
4671
4672 cond_resched();
4673 }
4674 return ret_val;
4675}
4676#endif
4677
4678#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4679
4680static struct kmem_cache *page_ptl_cachep;
4681
4682void __init ptlock_cache_init(void)
4683{
4684 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4685 SLAB_PANIC, NULL);
4686}
4687
4688bool ptlock_alloc(struct page *page)
4689{
4690 spinlock_t *ptl;
4691
4692 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4693 if (!ptl)
4694 return false;
4695 page->ptl = ptl;
4696 return true;
4697}
4698
4699void ptlock_free(struct page *page)
4700{
4701 kmem_cache_free(page_ptl_cachep, page->ptl);
4702}
4703#endif
4704