/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/sched/mm.h>
44#include <linux/sched/coredump.h>
45#include <linux/sched/numa_balancing.h>
46#include <linux/sched/task.h>
47#include <linux/hugetlb.h>
48#include <linux/mman.h>
49#include <linux/swap.h>
50#include <linux/highmem.h>
51#include <linux/pagemap.h>
52#include <linux/memremap.h>
53#include <linux/ksm.h>
54#include <linux/rmap.h>
55#include <linux/export.h>
56#include <linux/delayacct.h>
57#include <linux/init.h>
58#include <linux/pfn_t.h>
59#include <linux/writeback.h>
60#include <linux/memcontrol.h>
61#include <linux/mmu_notifier.h>
62#include <linux/kallsyms.h>
63#include <linux/swapops.h>
64#include <linux/elf.h>
65#include <linux/gfp.h>
66#include <linux/migrate.h>
67#include <linux/string.h>
68#include <linux/dma-debug.h>
69#include <linux/debugfs.h>
70#include <linux/userfaultfd_k.h>
71#include <linux/dax.h>
72#include <linux/oom.h>
73
74#include <asm/io.h>
75#include <asm/mmu_context.h>
76#include <asm/pgalloc.h>
77#include <linux/uaccess.h>
78#include <asm/tlb.h>
79#include <asm/tlbflush.h>
80#include <asm/pgtable.h>
81
82#include "internal.h"
83
84#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
85#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
86#endif
87
88#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
90unsigned long max_mapnr;
91EXPORT_SYMBOL(max_mapnr);
92
93struct page *mem_map;
94EXPORT_SYMBOL(mem_map);
95#endif
96
/*
 * high_memory is the virtual address of the first byte above the
 * kernel's directly mapped low memory.
 */
104void *high_memory;
105EXPORT_SYMBOL(high_memory);
106
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
113int randomize_va_space __read_mostly =
114#ifdef CONFIG_COMPAT_BRK
115 1;
116#else
117 2;
118#endif
119
120static int __init disable_randmaps(char *s)
121{
122 randomize_va_space = 0;
123 return 1;
124}
125__setup("norandmaps", disable_randmaps);
126
127unsigned long zero_pfn __read_mostly;
128EXPORT_SYMBOL(zero_pfn);
129
130unsigned long highest_memmap_pfn __read_mostly;
131
/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
135static int __init init_zero_pfn(void)
136{
137 zero_pfn = page_to_pfn(ZERO_PAGE(0));
138 return 0;
139}
140core_initcall(init_zero_pfn);
141
142
143#if defined(SPLIT_RSS_COUNTING)
144
145void sync_mm_rss(struct mm_struct *mm)
146{
147 int i;
148
149 for (i = 0; i < NR_MM_COUNTERS; i++) {
150 if (current->rss_stat.count[i]) {
151 add_mm_counter(mm, i, current->rss_stat.count[i]);
152 current->rss_stat.count[i] = 0;
153 }
154 }
155 current->rss_stat.events = 0;
156}
157
158static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
159{
160 struct task_struct *task = current;
161
162 if (likely(task->mm == mm))
163 task->rss_stat.count[member] += val;
164 else
165 add_mm_counter(mm, member, val);
166}
167#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
168#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
169
/* sync counter once per 64 page faults */
171#define TASK_RSS_EVENTS_THRESH (64)
172static void check_sync_rss_stat(struct task_struct *task)
173{
174 if (unlikely(task != current))
175 return;
176 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
177 sync_mm_rss(task->mm);
178}
179#else
180
181#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
182#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
183
184static void check_sync_rss_stat(struct task_struct *task)
185{
186}
187
188#endif
189
190#ifdef HAVE_GENERIC_MMU_GATHER
191
192static bool tlb_next_batch(struct mmu_gather *tlb)
193{
194 struct mmu_gather_batch *batch;
195
196 batch = tlb->active;
197 if (batch->next) {
198 tlb->active = batch->next;
199 return true;
200 }
201
202 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
203 return false;
204
205 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
206 if (!batch)
207 return false;
208
209 tlb->batch_count++;
210 batch->next = NULL;
211 batch->nr = 0;
212 batch->max = MAX_GATHER_BATCH;
213
214 tlb->active->next = batch;
215 tlb->active = batch;
216
217 return true;
218}
219
220void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
221 unsigned long start, unsigned long end)
222{
223 tlb->mm = mm;
224
        /* Is it from 0 to ~0? */
226 tlb->fullmm = !(start | (end+1));
227 tlb->need_flush_all = 0;
228 tlb->local.next = NULL;
229 tlb->local.nr = 0;
230 tlb->local.max = ARRAY_SIZE(tlb->__pages);
231 tlb->active = &tlb->local;
232 tlb->batch_count = 0;
233
234#ifdef CONFIG_HAVE_RCU_TABLE_FREE
235 tlb->batch = NULL;
236#endif
237 tlb->page_size = 0;
238
239 __tlb_reset_range(tlb);
240}
241
242static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
243{
244 if (!tlb->end)
245 return;
246
247 tlb_flush(tlb);
248 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
249#ifdef CONFIG_HAVE_RCU_TABLE_FREE
250 tlb_table_flush(tlb);
251#endif
252 __tlb_reset_range(tlb);
253}
254
255static void tlb_flush_mmu_free(struct mmu_gather *tlb)
256{
257 struct mmu_gather_batch *batch;
258
259 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266void tlb_flush_mmu(struct mmu_gather *tlb)
267{
268 tlb_flush_mmu_tlbonly(tlb);
269 tlb_flush_mmu_free(tlb);
270}
271
/*
 * Called at the end of the shootdown operation to free up any resources
 * that were required.
 */
276void arch_tlb_finish_mmu(struct mmu_gather *tlb,
277 unsigned long start, unsigned long end, bool force)
278{
279 struct mmu_gather_batch *batch, *next;
280
281 if (force)
282 __tlb_adjust_range(tlb, start, end - start);
283
284 tlb_flush_mmu(tlb);
285
        /* keep the page table cache within bounds */
287 check_pgt_cache();
288
289 for (batch = tlb->local.next; batch; batch = next) {
290 next = batch->next;
291 free_pages((unsigned long)batch, 0);
292 }
293 tlb->local.next = NULL;
294}
295
/*
 * __tlb_remove_page_size() queues @page to be freed once the TLB has been
 * flushed.  Returns true if we ran out of batch space and the caller must
 * flush (and free the gathered pages) before retrying, false if the page
 * was queued without trouble.
 */
303bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
304{
305 struct mmu_gather_batch *batch;
306
307 VM_BUG_ON(!tlb->end);
308 VM_WARN_ON(tlb->page_size != page_size);
309
310 batch = tlb->active;
        /*
         * Add the page and check if we are full. If so
         * force a flush.
         */
315 batch->pages[batch->nr++] = page;
316 if (batch->nr == batch->max) {
317 if (!tlb_next_batch(tlb))
318 return true;
319 batch = tlb->active;
320 }
321 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
322
323 return false;
324}
325
326#endif
327
328#ifdef CONFIG_HAVE_RCU_TABLE_FREE
329
/*
 * See the comment near struct mmu_table_batch.
 */
334static void tlb_remove_table_smp_sync(void *arg)
335{
        /* Simply deliver the interrupt */
337}
338
339static void tlb_remove_table_one(void *table)
340{
        /*
         * We could not allocate a batch page, so synchronize against any
         * concurrent (lockless) page-table walkers with an IPI on every
         * other CPU before freeing the table directly.
         */
348 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
349 __tlb_remove_table(table);
350}
351
352static void tlb_remove_table_rcu(struct rcu_head *head)
353{
354 struct mmu_table_batch *batch;
355 int i;
356
357 batch = container_of(head, struct mmu_table_batch, rcu);
358
359 for (i = 0; i < batch->nr; i++)
360 __tlb_remove_table(batch->tables[i]);
361
362 free_page((unsigned long)batch);
363}
364
365void tlb_table_flush(struct mmu_gather *tlb)
366{
367 struct mmu_table_batch **batch = &tlb->batch;
368
369 if (*batch) {
370 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
371 *batch = NULL;
372 }
373}
374
375void tlb_remove_table(struct mmu_gather *tlb, void *table)
376{
377 struct mmu_table_batch **batch = &tlb->batch;
378
        /*
         * When there are fewer than two users of this mm there cannot be a
         * concurrent page-table walk.
         */
383 if (atomic_read(&tlb->mm->mm_users) < 2) {
384 __tlb_remove_table(table);
385 return;
386 }
387
388 if (*batch == NULL) {
389 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
390 if (*batch == NULL) {
391 tlb_remove_table_one(table);
392 return;
393 }
394 (*batch)->nr = 0;
395 }
396 (*batch)->tables[(*batch)->nr++] = table;
397 if ((*batch)->nr == MAX_TABLE_BATCH)
398 tlb_table_flush(tlb);
399}
400
401#endif
402
/*
 * tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for
 *	page-table tear-down from @mm.
 */
408void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
409 unsigned long start, unsigned long end)
410{
411 arch_tlb_gather_mmu(tlb, mm, start, end);
412 inc_tlb_flush_pending(tlb->mm);
413}
414
415void tlb_finish_mmu(struct mmu_gather *tlb,
416 unsigned long start, unsigned long end)
417{
        /*
         * If parallel threads are doing batched PTE unmaps on this mm under
         * a non-exclusive lock (e.g. mmap_sem read side), one of them may
         * defer a TLB flush that we still depend on, so flush forcefully
         * when nested TLB-flushing work is detected.
         */
425 bool force = mm_tlb_flush_nested(tlb->mm);
426
427 arch_tlb_finish_mmu(tlb, start, end, force);
428 dec_tlb_flush_pending(tlb->mm);
429}
430
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
435static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
436 unsigned long addr)
437{
438 pgtable_t token = pmd_pgtable(*pmd);
439 pmd_clear(pmd);
440 pte_free_tlb(tlb, token, addr);
441 atomic_long_dec(&tlb->mm->nr_ptes);
442}
443
444static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
445 unsigned long addr, unsigned long end,
446 unsigned long floor, unsigned long ceiling)
447{
448 pmd_t *pmd;
449 unsigned long next;
450 unsigned long start;
451
452 start = addr;
453 pmd = pmd_offset(pud, addr);
454 do {
455 next = pmd_addr_end(addr, end);
456 if (pmd_none_or_clear_bad(pmd))
457 continue;
458 free_pte_range(tlb, pmd, addr);
459 } while (pmd++, addr = next, addr != end);
460
461 start &= PUD_MASK;
462 if (start < floor)
463 return;
464 if (ceiling) {
465 ceiling &= PUD_MASK;
466 if (!ceiling)
467 return;
468 }
469 if (end - 1 > ceiling - 1)
470 return;
471
472 pmd = pmd_offset(pud, start);
473 pud_clear(pud);
474 pmd_free_tlb(tlb, pmd, start);
475 mm_dec_nr_pmds(tlb->mm);
476}
477
478static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
479 unsigned long addr, unsigned long end,
480 unsigned long floor, unsigned long ceiling)
481{
482 pud_t *pud;
483 unsigned long next;
484 unsigned long start;
485
486 start = addr;
487 pud = pud_offset(p4d, addr);
488 do {
489 next = pud_addr_end(addr, end);
490 if (pud_none_or_clear_bad(pud))
491 continue;
492 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
493 } while (pud++, addr = next, addr != end);
494
495 start &= P4D_MASK;
496 if (start < floor)
497 return;
498 if (ceiling) {
499 ceiling &= P4D_MASK;
500 if (!ceiling)
501 return;
502 }
503 if (end - 1 > ceiling - 1)
504 return;
505
506 pud = pud_offset(p4d, start);
507 p4d_clear(p4d);
508 pud_free_tlb(tlb, pud, start);
509}
510
511static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
512 unsigned long addr, unsigned long end,
513 unsigned long floor, unsigned long ceiling)
514{
515 p4d_t *p4d;
516 unsigned long next;
517 unsigned long start;
518
519 start = addr;
520 p4d = p4d_offset(pgd, addr);
521 do {
522 next = p4d_addr_end(addr, end);
523 if (p4d_none_or_clear_bad(p4d))
524 continue;
525 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
526 } while (p4d++, addr = next, addr != end);
527
528 start &= PGDIR_MASK;
529 if (start < floor)
530 return;
531 if (ceiling) {
532 ceiling &= PGDIR_MASK;
533 if (!ceiling)
534 return;
535 }
536 if (end - 1 > ceiling - 1)
537 return;
538
539 p4d = p4d_offset(pgd, start);
540 pgd_clear(pgd);
541 p4d_free_tlb(tlb, p4d, start);
542}
543
/*
 * This function frees user-level page tables of a process.
 */
547void free_pgd_range(struct mmu_gather *tlb,
548 unsigned long addr, unsigned long end,
549 unsigned long floor, unsigned long ceiling)
550{
551 pgd_t *pgd;
552 unsigned long next;
553
        /*
         * Why all the "- 1"s?  Because 0 represents both the bottom of the
         * address space and the top of it (using -1 for the top wouldn't
         * work: the masks would do the wrong thing).  The rule is that
         * addr 0 and floor 0 refer to the bottom of the address space, but
         * end 0 and ceiling 0 refer to the top.  Comparisons therefore need
         * to use "end - 1" and "ceiling - 1".
         *
         * The floor/ceiling arguments prevent us from freeing page-table
         * pages that a neighbouring vma may still be using: a pXd page is
         * only freed when the whole aligned span it covers lies inside
         * [floor, ceiling).
         */
580 addr &= PMD_MASK;
581 if (addr < floor) {
582 addr += PMD_SIZE;
583 if (!addr)
584 return;
585 }
586 if (ceiling) {
587 ceiling &= PMD_MASK;
588 if (!ceiling)
589 return;
590 }
591 if (end - 1 > ceiling - 1)
592 end -= PMD_SIZE;
593 if (addr > end - 1)
594 return;
595
596
597
598
599 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
600 pgd = pgd_offset(tlb->mm, addr);
601 do {
602 next = pgd_addr_end(addr, end);
603 if (pgd_none_or_clear_bad(pgd))
604 continue;
605 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
606 } while (pgd++, addr = next, addr != end);
607}
608
609void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
610 unsigned long floor, unsigned long ceiling)
611{
612 while (vma) {
613 struct vm_area_struct *next = vma->vm_next;
614 unsigned long addr = vma->vm_start;
615
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
620 unlink_anon_vmas(vma);
621 unlink_file_vma(vma);
622
623 if (is_vm_hugetlb_page(vma)) {
624 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
625 floor, next ? next->vm_start : ceiling);
626 } else {
                        /*
                         * Optimization: gather nearby vmas into one call down
                         */
630 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
631 && !is_vm_hugetlb_page(next)) {
632 vma = next;
633 next = vma->vm_next;
634 unlink_anon_vmas(vma);
635 unlink_file_vma(vma);
636 }
637 free_pgd_range(tlb, addr, vma->vm_end,
638 floor, next ? next->vm_start : ceiling);
639 }
640 vma = next;
641 }
642}
643
644int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
645{
646 spinlock_t *ptl;
647 pgtable_t new = pte_alloc_one(mm, address);
648 if (!new)
649 return -ENOMEM;
650
        /*
         * Ensure all pte setup (eg. pte page lock and page clearing) are
         * visible before the pte is made visible to other CPUs by being
         * put into page tables.
         *
         * The other side of the story is the pointer chasing in the page
         * table walking code (when walking the page table without locking;
         * ie. most of the time). Fortunately, these data accesses consist
         * of a chain of data-dependent loads, meaning most CPUs (alpha
         * being the notable exception) will already guarantee loads are
         * seen in-order. See the alpha page table accessors for the
         * smp_read_barrier_depends() barriers in page table walking code.
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
665
666 ptl = pmd_lock(mm, pmd);
667 if (likely(pmd_none(*pmd))) {
668 atomic_long_inc(&mm->nr_ptes);
669 pmd_populate(mm, pmd, new);
670 new = NULL;
671 }
672 spin_unlock(ptl);
673 if (new)
674 pte_free(mm, new);
675 return 0;
676}
677
678int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
679{
680 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
681 if (!new)
682 return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

686 spin_lock(&init_mm.page_table_lock);
687 if (likely(pmd_none(*pmd))) {
688 pmd_populate_kernel(&init_mm, pmd, new);
689 new = NULL;
690 }
691 spin_unlock(&init_mm.page_table_lock);
692 if (new)
693 pte_free_kernel(&init_mm, new);
694 return 0;
695}
696
697static inline void init_rss_vec(int *rss)
698{
699 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
700}
701
702static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
703{
704 int i;
705
706 if (current->mm == mm)
707 sync_mm_rss(mm);
708 for (i = 0; i < NR_MM_COUNTERS; i++)
709 if (rss[i])
710 add_mm_counter(mm, i, rss[i]);
711}
712
/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
720static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
721 pte_t pte, struct page *page)
722{
723 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
724 p4d_t *p4d = p4d_offset(pgd, addr);
725 pud_t *pud = pud_offset(p4d, addr);
726 pmd_t *pmd = pmd_offset(pud, addr);
727 struct address_space *mapping;
728 pgoff_t index;
729 static unsigned long resume;
730 static unsigned long nr_shown;
731 static unsigned long nr_unshown;
732
        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
737 if (nr_shown == 60) {
738 if (time_before(jiffies, resume)) {
739 nr_unshown++;
740 return;
741 }
742 if (nr_unshown) {
743 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
744 nr_unshown);
745 nr_unshown = 0;
746 }
747 nr_shown = 0;
748 }
749 if (nr_shown++ == 0)
750 resume = jiffies + 60 * HZ;
751
752 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
753 index = linear_page_index(vma, addr);
754
755 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
756 current->comm,
757 (long long)pte_val(pte), (long long)pmd_val(*pmd));
758 if (page)
759 dump_page(page, "bad pte");
760 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
761 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        /*
         * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
         */
765 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
766 vma->vm_file,
767 vma->vm_ops ? vma->vm_ops->fault : NULL,
768 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
769 mapping ? mapping->a_ops->readpage : NULL);
770 dump_stack();
771 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
772}
773
/*
 * vm_normal_page -- This function gets the "struct page" associated with
 * a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here.  "Normal" mappings do have a
 * struct page.
 *
 * There are two broad cases.  Firstly, an architecture may define a
 * pte_special() pte bit, in which case this function is trivial: special
 * mappings carry the bit and are filtered out directly.
 *
 * Otherwise, the information has to be carried in the vma: with
 * VM_MIXEDMAP, pfn_valid() decides whether the pfn is normal; with
 * VM_PFNMAP, remap_pfn_range() stores the base pfn in vm_pgoff for linear
 * mappings, so COWed pages (which are normal) can be told apart from the
 * raw remapped pfns (which are special).
 */
816#ifdef __HAVE_ARCH_PTE_SPECIAL
817# define HAVE_PTE_SPECIAL 1
818#else
819# define HAVE_PTE_SPECIAL 0
820#endif
821struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
822 pte_t pte, bool with_public_device)
823{
824 unsigned long pfn = pte_pfn(pte);
825
826 if (HAVE_PTE_SPECIAL) {
827 if (likely(!pte_special(pte)))
828 goto check_pfn;
829 if (vma->vm_ops && vma->vm_ops->find_special_page)
830 return vma->vm_ops->find_special_page(vma, addr);
831 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
832 return NULL;
833 if (is_zero_pfn(pfn))
834 return NULL;
                /*
                 * Device public pages are special ZONE_DEVICE pages: they
                 * are refcounted and behave almost like normal pages, but
                 * they are never on the LRU, so they must be kept away from
                 * anything doing LRU manipulation (mlock, NUMA balancing,
                 * ...).  That is why vm_normal_page() still returns NULL
                 * for them unless the caller explicitly asked for them
                 * (with_public_device).
                 */
848 if (likely(pfn <= highest_memmap_pfn)) {
849 struct page *page = pfn_to_page(pfn);
850
851 if (is_device_public_page(page)) {
852 if (with_public_device)
853 return page;
854 return NULL;
855 }
856 }
857 print_bad_pte(vma, addr, pte, NULL);
858 return NULL;
859 }
860
        /* !HAVE_PTE_SPECIAL case follows: */

863 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
864 if (vma->vm_flags & VM_MIXEDMAP) {
865 if (!pfn_valid(pfn))
866 return NULL;
867 goto out;
868 } else {
869 unsigned long off;
870 off = (addr - vma->vm_start) >> PAGE_SHIFT;
871 if (pfn == vma->vm_pgoff + off)
872 return NULL;
873 if (!is_cow_mapping(vma->vm_flags))
874 return NULL;
875 }
876 }
877
878 if (is_zero_pfn(pfn))
879 return NULL;
880check_pfn:
881 if (unlikely(pfn > highest_memmap_pfn)) {
882 print_bad_pte(vma, addr, pte, NULL);
883 return NULL;
884 }
885
        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
890out:
891 return pfn_to_page(pfn);
892}
893
894#ifdef CONFIG_TRANSPARENT_HUGEPAGE
895struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
896 pmd_t pmd)
897{
898 unsigned long pfn = pmd_pfn(pmd);
899
        /*
         * There is no pmd_special() but there can be special pmds, e.g. in
         * a direct-access (dax) mapping, so just replicate the
         * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
         */
905 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
906 if (vma->vm_flags & VM_MIXEDMAP) {
907 if (!pfn_valid(pfn))
908 return NULL;
909 goto out;
910 } else {
911 unsigned long off;
912 off = (addr - vma->vm_start) >> PAGE_SHIFT;
913 if (pfn == vma->vm_pgoff + off)
914 return NULL;
915 if (!is_cow_mapping(vma->vm_flags))
916 return NULL;
917 }
918 }
919
920 if (is_zero_pfn(pfn))
921 return NULL;
922 if (unlikely(pfn > highest_memmap_pfn))
923 return NULL;
924
925
926
927
928
929out:
930 return pfn_to_page(pfn);
931}
932#endif
933
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */
940static inline unsigned long
941copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
942 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
943 unsigned long addr, int *rss)
944{
945 unsigned long vm_flags = vma->vm_flags;
946 pte_t pte = *src_pte;
947 struct page *page;
948
        /* pte contains position in swap or file, so copy. */
950 if (unlikely(!pte_present(pte))) {
951 swp_entry_t entry = pte_to_swp_entry(pte);
952
953 if (likely(!non_swap_entry(entry))) {
954 if (swap_duplicate(entry) < 0)
955 return entry.val;

                        /* make sure dst_mm is on swapoff's mmlist. */
958 if (unlikely(list_empty(&dst_mm->mmlist))) {
959 spin_lock(&mmlist_lock);
960 if (list_empty(&dst_mm->mmlist))
961 list_add(&dst_mm->mmlist,
962 &src_mm->mmlist);
963 spin_unlock(&mmlist_lock);
964 }
965 rss[MM_SWAPENTS]++;
966 } else if (is_migration_entry(entry)) {
967 page = migration_entry_to_page(entry);
968
969 rss[mm_counter(page)]++;
970
971 if (is_write_migration_entry(entry) &&
972 is_cow_mapping(vm_flags)) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                 */
977 make_migration_entry_read(&entry);
978 pte = swp_entry_to_pte(entry);
979 if (pte_swp_soft_dirty(*src_pte))
980 pte = pte_swp_mksoft_dirty(pte);
981 set_pte_at(src_mm, addr, src_pte, pte);
982 }
983 } else if (is_device_private_entry(entry)) {
984 page = device_private_entry_to_page(entry);
985
                        /*
                         * Update the rss count even for un-addressable
                         * device private pages: from the rss accounting
                         * point of view they are just like any other page.
                         * They are owned by the device and referenced via a
                         * special swap entry, so take a reference and
                         * duplicate the rmap as for a normal page.
                         */
995 get_page(page);
996 rss[mm_counter(page)]++;
997 page_dup_rmap(page, false);
998
                        /*
                         * We do not preserve soft-dirty information, because
                         * so far, checkpoint/restore is the only feature that
                         * requires that. And checkpoint/restore does not work
                         * when a device driver is involved (you cannot easily
                         * save and restore device driver state).
                         */
1006 if (is_write_device_private_entry(entry) &&
1007 is_cow_mapping(vm_flags)) {
1008 make_device_private_entry_read(&entry);
1009 pte = swp_entry_to_pte(entry);
1010 set_pte_at(src_mm, addr, src_pte, pte);
1011 }
1012 }
1013 goto out_set_pte;
1014 }
1015
        /*
         * If it's a COW mapping, write protect it both
         * in the parent and the child
         */
1020 if (is_cow_mapping(vm_flags)) {
1021 ptep_set_wrprotect(src_mm, addr, src_pte);
1022 pte = pte_wrprotect(pte);
1023 }
1024
        /*
         * If it's a shared mapping, mark it clean in
         * the child
         */
1029 if (vm_flags & VM_SHARED)
1030 pte = pte_mkclean(pte);
1031 pte = pte_mkold(pte);
1032
1033 page = vm_normal_page(vma, addr, pte);
1034 if (page) {
1035 get_page(page);
1036 page_dup_rmap(page, false);
1037 rss[mm_counter(page)]++;
1038 } else if (pte_devmap(pte)) {
1039 page = pte_page(pte);
1040
1041
1042
1043
1044
1045
1046 if (is_device_public_page(page)) {
1047 get_page(page);
1048 page_dup_rmap(page, false);
1049 rss[mm_counter(page)]++;
1050 }
1051 }
1052
1053out_set_pte:
1054 set_pte_at(dst_mm, addr, dst_pte, pte);
1055 return 0;
1056}
1057
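/*
 * Copy one pte level from the parent to the child, a PMD's worth of ptes
 * at a time.  The batch of copy_one_pte() calls runs with both pte locks
 * held; the loop drops out periodically (and when a swap-count
 * continuation must be allocated) so the locks are never held too long.
 */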
1058static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1059 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
1060 unsigned long addr, unsigned long end)
1061{
1062 pte_t *orig_src_pte, *orig_dst_pte;
1063 pte_t *src_pte, *dst_pte;
1064 spinlock_t *src_ptl, *dst_ptl;
1065 int progress = 0;
1066 int rss[NR_MM_COUNTERS];
1067 swp_entry_t entry = (swp_entry_t){0};
1068
1069again:
1070 init_rss_vec(rss);
1071
1072 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1073 if (!dst_pte)
1074 return -ENOMEM;
1075 src_pte = pte_offset_map(src_pmd, addr);
1076 src_ptl = pte_lockptr(src_mm, src_pmd);
1077 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1078 orig_src_pte = src_pte;
1079 orig_dst_pte = dst_pte;
1080 arch_enter_lazy_mmu_mode();
1081
1082 do {
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
1087 if (progress >= 32) {
1088 progress = 0;
1089 if (need_resched() ||
1090 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
1091 break;
1092 }
1093 if (pte_none(*src_pte)) {
1094 progress++;
1095 continue;
1096 }
1097 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
1098 vma, addr, rss);
1099 if (entry.val)
1100 break;
1101 progress += 8;
1102 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1103
1104 arch_leave_lazy_mmu_mode();
1105 spin_unlock(src_ptl);
1106 pte_unmap(orig_src_pte);
1107 add_mm_rss_vec(dst_mm, rss);
1108 pte_unmap_unlock(orig_dst_pte, dst_ptl);
1109 cond_resched();
1110
1111 if (entry.val) {
1112 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
1113 return -ENOMEM;
1114 progress = 0;
1115 }
1116 if (addr != end)
1117 goto again;
1118 return 0;
1119}
1120
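/*
 * Copy a PMD range, handing huge/devmap/swap pmds to copy_huge_pmd().
 * The pud and p4d level copies below follow the same walk-and-recurse
 * pattern.
 */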
1121static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1122 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
1123 unsigned long addr, unsigned long end)
1124{
1125 pmd_t *src_pmd, *dst_pmd;
1126 unsigned long next;
1127
1128 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1129 if (!dst_pmd)
1130 return -ENOMEM;
1131 src_pmd = pmd_offset(src_pud, addr);
1132 do {
1133 next = pmd_addr_end(addr, end);
1134 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
1135 || pmd_devmap(*src_pmd)) {
1136 int err;
1137 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
1138 err = copy_huge_pmd(dst_mm, src_mm,
1139 dst_pmd, src_pmd, addr, vma);
1140 if (err == -ENOMEM)
1141 return -ENOMEM;
1142 if (!err)
1143 continue;
                        /* fall through */
1145 }
1146 if (pmd_none_or_clear_bad(src_pmd))
1147 continue;
1148 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1149 vma, addr, next))
1150 return -ENOMEM;
1151 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1152 return 0;
1153}
1154
1155static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1156 p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
1157 unsigned long addr, unsigned long end)
1158{
1159 pud_t *src_pud, *dst_pud;
1160 unsigned long next;
1161
1162 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1163 if (!dst_pud)
1164 return -ENOMEM;
1165 src_pud = pud_offset(src_p4d, addr);
1166 do {
1167 next = pud_addr_end(addr, end);
1168 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1169 int err;
1170
1171 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
1172 err = copy_huge_pud(dst_mm, src_mm,
1173 dst_pud, src_pud, addr, vma);
1174 if (err == -ENOMEM)
1175 return -ENOMEM;
1176 if (!err)
1177 continue;
                        /* fall through */
1179 }
1180 if (pud_none_or_clear_bad(src_pud))
1181 continue;
1182 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1183 vma, addr, next))
1184 return -ENOMEM;
1185 } while (dst_pud++, src_pud++, addr = next, addr != end);
1186 return 0;
1187}
1188
1189static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1190 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1191 unsigned long addr, unsigned long end)
1192{
1193 p4d_t *src_p4d, *dst_p4d;
1194 unsigned long next;
1195
1196 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1197 if (!dst_p4d)
1198 return -ENOMEM;
1199 src_p4d = p4d_offset(src_pgd, addr);
1200 do {
1201 next = p4d_addr_end(addr, end);
1202 if (p4d_none_or_clear_bad(src_p4d))
1203 continue;
1204 if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
1205 vma, addr, next))
1206 return -ENOMEM;
1207 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
1208 return 0;
1209}
1210
1211int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1212 struct vm_area_struct *vma)
1213{
1214 pgd_t *src_pgd, *dst_pgd;
1215 unsigned long next;
1216 unsigned long addr = vma->vm_start;
1217 unsigned long end = vma->vm_end;
1218 unsigned long mmun_start;
1219 unsigned long mmun_end;
1220 bool is_cow;
1221 int ret;
1222
        /*
         * Don't copy ptes where a page fault will fill them correctly.
         * Fork becomes much lighter when there are big shared or private
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
1229 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1230 !vma->anon_vma)
1231 return 0;
1232
1233 if (is_vm_hugetlb_page(vma))
1234 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1235
1236 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
1241 ret = track_pfn_copy(vma);
1242 if (ret)
1243 return ret;
1244 }
1245
        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
1252 is_cow = is_cow_mapping(vma->vm_flags);
1253 mmun_start = addr;
1254 mmun_end = end;
1255 if (is_cow)
1256 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1257 mmun_end);
1258
1259 ret = 0;
1260 dst_pgd = pgd_offset(dst_mm, addr);
1261 src_pgd = pgd_offset(src_mm, addr);
1262 do {
1263 next = pgd_addr_end(addr, end);
1264 if (pgd_none_or_clear_bad(src_pgd))
1265 continue;
1266 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
1267 vma, addr, next))) {
1268 ret = -ENOMEM;
1269 break;
1270 }
1271 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1272
1273 if (is_cow)
1274 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1275 return ret;
1276}
1277
1278static unsigned long zap_pte_range(struct mmu_gather *tlb,
1279 struct vm_area_struct *vma, pmd_t *pmd,
1280 unsigned long addr, unsigned long end,
1281 struct zap_details *details)
1282{
1283 struct mm_struct *mm = tlb->mm;
1284 int force_flush = 0;
1285 int rss[NR_MM_COUNTERS];
1286 spinlock_t *ptl;
1287 pte_t *start_pte;
1288 pte_t *pte;
1289 swp_entry_t entry;
1290
1291 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
1292again:
1293 init_rss_vec(rss);
1294 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1295 pte = start_pte;
1296 flush_tlb_batched_pending(mm);
1297 arch_enter_lazy_mmu_mode();
1298 do {
1299 pte_t ptent = *pte;
1300 if (pte_none(ptent))
1301 continue;
1302
1303 if (pte_present(ptent)) {
1304 struct page *page;
1305
1306 page = _vm_normal_page(vma, addr, ptent, true);
1307 if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
                                 * invalidate cache without truncating:
                                 * unmap shared but keep private pages.
                                 */
1313 if (details->check_mapping &&
1314 details->check_mapping != page_rmapping(page))
1315 continue;
1316 }
1317 ptent = ptep_get_and_clear_full(mm, addr, pte,
1318 tlb->fullmm);
1319 tlb_remove_tlb_entry(tlb, pte, addr);
1320 if (unlikely(!page))
1321 continue;
1322
1323 if (!PageAnon(page)) {
1324 if (pte_dirty(ptent)) {
1325 force_flush = 1;
1326 set_page_dirty(page);
1327 }
1328 if (pte_young(ptent) &&
1329 likely(!(vma->vm_flags & VM_SEQ_READ)))
1330 mark_page_accessed(page);
1331 }
1332 rss[mm_counter(page)]--;
1333 page_remove_rmap(page, false);
1334 if (unlikely(page_mapcount(page) < 0))
1335 print_bad_pte(vma, addr, ptent, page);
1336 if (unlikely(__tlb_remove_page(tlb, page))) {
1337 force_flush = 1;
1338 addr += PAGE_SIZE;
1339 break;
1340 }
1341 continue;
1342 }
1343
1344 entry = pte_to_swp_entry(ptent);
1345 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1346 struct page *page = device_private_entry_to_page(entry);
1347
1348 if (unlikely(details && details->check_mapping)) {
1349
1350
1351
1352
1353
1354 if (details->check_mapping !=
1355 page_rmapping(page))
1356 continue;
1357 }
1358
1359 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1360 rss[mm_counter(page)]--;
1361 page_remove_rmap(page, false);
1362 put_page(page);
1363 continue;
1364 }
1365
                /* If details->check_mapping, we leave swap entries. */
1367 if (unlikely(details))
1368 continue;
1369
1370 entry = pte_to_swp_entry(ptent);
1371 if (!non_swap_entry(entry))
1372 rss[MM_SWAPENTS]--;
1373 else if (is_migration_entry(entry)) {
1374 struct page *page;
1375
1376 page = migration_entry_to_page(entry);
1377 rss[mm_counter(page)]--;
1378 }
1379 if (unlikely(!free_swap_and_cache(entry)))
1380 print_bad_pte(vma, addr, ptent, NULL);
1381 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1382 } while (pte++, addr += PAGE_SIZE, addr != end);
1383
1384 add_mm_rss_vec(mm, rss);
1385 arch_leave_lazy_mmu_mode();
1386
        /* Do the actual TLB flush before dropping ptl */
1388 if (force_flush)
1389 tlb_flush_mmu_tlbonly(tlb);
1390 pte_unmap_unlock(start_pte, ptl);
1391
        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Restart if we didn't do everything.
         */
1398 if (force_flush) {
1399 force_flush = 0;
1400 tlb_flush_mmu_free(tlb);
1401 if (addr != end)
1402 goto again;
1403 }
1404
1405 return addr;
1406}
1407
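/*
 * Zap a PMD range, handing transparent huge pmds to zap_huge_pmd() (or
 * splitting them first when the range only covers part of the huge page)
 * and descending into zap_pte_range() for everything else.
 */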
1408static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1409 struct vm_area_struct *vma, pud_t *pud,
1410 unsigned long addr, unsigned long end,
1411 struct zap_details *details)
1412{
1413 pmd_t *pmd;
1414 unsigned long next;
1415
1416 pmd = pmd_offset(pud, addr);
1417 do {
1418 next = pmd_addr_end(addr, end);
1419 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1420 if (next - addr != HPAGE_PMD_SIZE) {
1421 VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
1422 !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1423 __split_huge_pmd(vma, pmd, addr, false, NULL);
1424 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1425 goto next;
                        /* fall through */
1427 }
1428
                /*
                 * Here there can be other concurrent MADV_DONTNEED or
                 * trans huge page faults running, and if the pmd is
                 * none or trans huge it can change under us. This is
                 * because MADV_DONTNEED holds the mmap_sem in read
                 * mode.
                 */
1435 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1436 goto next;
1437 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1438next:
1439 cond_resched();
1440 } while (pmd++, addr = next, addr != end);
1441
1442 return addr;
1443}
1444
1445static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1446 struct vm_area_struct *vma, p4d_t *p4d,
1447 unsigned long addr, unsigned long end,
1448 struct zap_details *details)
1449{
1450 pud_t *pud;
1451 unsigned long next;
1452
1453 pud = pud_offset(p4d, addr);
1454 do {
1455 next = pud_addr_end(addr, end);
1456 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1457 if (next - addr != HPAGE_PUD_SIZE) {
1458 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1459 split_huge_pud(vma, pud, addr);
1460 } else if (zap_huge_pud(tlb, vma, pud, addr))
1461 goto next;
                        /* fall through */
1463 }
1464 if (pud_none_or_clear_bad(pud))
1465 continue;
1466 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1467next:
1468 cond_resched();
1469 } while (pud++, addr = next, addr != end);
1470
1471 return addr;
1472}
1473
1474static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1475 struct vm_area_struct *vma, pgd_t *pgd,
1476 unsigned long addr, unsigned long end,
1477 struct zap_details *details)
1478{
1479 p4d_t *p4d;
1480 unsigned long next;
1481
1482 p4d = p4d_offset(pgd, addr);
1483 do {
1484 next = p4d_addr_end(addr, end);
1485 if (p4d_none_or_clear_bad(p4d))
1486 continue;
1487 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1488 } while (p4d++, addr = next, addr != end);
1489
1490 return addr;
1491}
1492
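/*
 * Walk the page tables of a single vma and zap every pte in [addr, end),
 * honouring @details for truncation-style callers.
 */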
1493void unmap_page_range(struct mmu_gather *tlb,
1494 struct vm_area_struct *vma,
1495 unsigned long addr, unsigned long end,
1496 struct zap_details *details)
1497{
1498 pgd_t *pgd;
1499 unsigned long next;
1500
1501 BUG_ON(addr >= end);
1502 tlb_start_vma(tlb, vma);
1503 pgd = pgd_offset(vma->vm_mm, addr);
1504 do {
1505 next = pgd_addr_end(addr, end);
1506 if (pgd_none_or_clear_bad(pgd))
1507 continue;
1508 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1509 } while (pgd++, addr = next, addr != end);
1510 tlb_end_vma(tlb, vma);
1511}
1512
1513
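/*
 * Clip [start_addr, end_addr) to a single vma and unmap it, taking the
 * hugetlb and PFN-map special cases into account.
 */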
1514static void unmap_single_vma(struct mmu_gather *tlb,
1515 struct vm_area_struct *vma, unsigned long start_addr,
1516 unsigned long end_addr,
1517 struct zap_details *details)
1518{
1519 unsigned long start = max(vma->vm_start, start_addr);
1520 unsigned long end;
1521
1522 if (start >= vma->vm_end)
1523 return;
1524 end = min(vma->vm_end, end_addr);
1525 if (end <= vma->vm_start)
1526 return;
1527
1528 if (vma->vm_file)
1529 uprobe_munmap(vma, start, end);
1530
1531 if (unlikely(vma->vm_flags & VM_PFNMAP))
1532 untrack_pfn(vma, 0, 0);
1533
1534 if (start != end) {
1535 if (unlikely(is_vm_hugetlb_page(vma))) {
                        /*
                         * It is undesirable to test vma->vm_file, as it
                         * should be non-NULL for a valid hugetlb area.
                         * However, vm_file will be NULL in the error
                         * cleanup path of mmap_region: when the hugetlbfs
                         * ->mmap method fails, mmap_region() nullifies
                         * vma->vm_file before calling free_pgtables().  So
                         * guard the i_mmap_lock_write() on vm_file being
                         * set.
                         */
1547 if (vma->vm_file) {
1548 i_mmap_lock_write(vma->vm_file->f_mapping);
1549 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1550 i_mmap_unlock_write(vma->vm_file->f_mapping);
1551 }
1552 } else
1553 unmap_page_range(tlb, vma, start, end, details);
1554 }
1555}
1556
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped
 * address range after unmap_vmas() returns.  So the only responsibility
 * here is to ensure that any thus-far unmapped pages are flushed before
 * unmap_vmas() drops the lock and schedules.
 */
1575void unmap_vmas(struct mmu_gather *tlb,
1576 struct vm_area_struct *vma, unsigned long start_addr,
1577 unsigned long end_addr)
1578{
1579 struct mm_struct *mm = vma->vm_mm;
1580
1581 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1582 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1583 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1584 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1585}
1586
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the VMA list
 */
1595void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1596 unsigned long size)
1597{
1598 struct mm_struct *mm = vma->vm_mm;
1599 struct mmu_gather tlb;
1600 unsigned long end = start + size;
1601
1602 lru_add_drain();
1603 tlb_gather_mmu(&tlb, mm, start, end);
1604 update_hiwater_rss(mm);
1605 mmu_notifier_invalidate_range_start(mm, start, end);
1606 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
1607 unmap_single_vma(&tlb, vma, start, end, NULL);
                /*
                 * zap_page_range does not specify whether mmap_sem should be
                 * held for read or write. That allows parallel zap_page_range
                 * operations to unmap a PTE and defer a flush meaning that
                 * this call observes pte_none and fails to flush the TLB.
                 * Rather than adding a complex API, ensure that no stale
                 * TLB entries exist when this call returns.
                 */
1617 flush_tlb_range(vma, start, end);
1618 }
1619
1620 mmu_notifier_invalidate_range_end(mm, start, end);
1621 tlb_finish_mmu(&tlb, start, end);
1622}
1623
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
1633static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1634 unsigned long size, struct zap_details *details)
1635{
1636 struct mm_struct *mm = vma->vm_mm;
1637 struct mmu_gather tlb;
1638 unsigned long end = address + size;
1639
1640 lru_add_drain();
1641 tlb_gather_mmu(&tlb, mm, address, end);
1642 update_hiwater_rss(mm);
1643 mmu_notifier_invalidate_range_start(mm, address, end);
1644 unmap_single_vma(&tlb, vma, address, end, details);
1645 mmu_notifier_invalidate_range_end(mm, address, end);
1646 tlb_finish_mmu(&tlb, address, end);
1647}
1648
/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
1661int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1662 unsigned long size)
1663{
1664 if (address < vma->vm_start || address + size > vma->vm_end ||
1665 !(vma->vm_flags & VM_PFNMAP))
1666 return -1;
1667 zap_page_range_single(vma, address, size, NULL);
1668 return 0;
1669}
1670EXPORT_SYMBOL_GPL(zap_vma_ptes);
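/*
 * Illustrative use (hypothetical names) from a driver that set up a
 * VM_PFNMAP mapping and wants to tear part of it down, e.g. on reset:
 *
 *	if (zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE))
 *		pr_warn("vma was not a PFN mapping\n");
 *
 * The range must lie entirely inside @vma and the vma must be VM_PFNMAP.
 */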
1671
1672pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1673 spinlock_t **ptl)
1674{
1675 pgd_t *pgd;
1676 p4d_t *p4d;
1677 pud_t *pud;
1678 pmd_t *pmd;
1679
1680 pgd = pgd_offset(mm, addr);
1681 p4d = p4d_alloc(mm, pgd, addr);
1682 if (!p4d)
1683 return NULL;
1684 pud = pud_alloc(mm, p4d, addr);
1685 if (!pud)
1686 return NULL;
1687 pmd = pmd_alloc(mm, pud, addr);
1688 if (!pmd)
1689 return NULL;
1690
1691 VM_BUG_ON(pmd_trans_huge(*pmd));
1692 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1693}
1694
/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
1702static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1703 struct page *page, pgprot_t prot)
1704{
1705 struct mm_struct *mm = vma->vm_mm;
1706 int retval;
1707 pte_t *pte;
1708 spinlock_t *ptl;
1709
1710 retval = -EINVAL;
1711 if (PageAnon(page))
1712 goto out;
1713 retval = -ENOMEM;
1714 flush_dcache_page(page);
1715 pte = get_locked_pte(mm, addr, &ptl);
1716 if (!pte)
1717 goto out;
1718 retval = -EBUSY;
1719 if (!pte_none(*pte))
1720 goto out_unlock;
1721
        /* Ok, finally just insert the thing.. */
1723 get_page(page);
1724 inc_mm_counter_fast(mm, mm_counter_file(page));
1725 page_add_file_rmap(page, false);
1726 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1727
1728 retval = 0;
1729 pte_unmap_unlock(pte, ptl);
1730 return retval;
1731out_unlock:
1732 pte_unmap_unlock(pte, ptl);
1733out:
1734 return retval;
1735}
1736
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handlers
 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 */
1764int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1765 struct page *page)
1766{
1767 if (addr < vma->vm_start || addr >= vma->vm_end)
1768 return -EFAULT;
1769 if (!page_count(page))
1770 return -EINVAL;
1771 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1772 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1773 BUG_ON(vma->vm_flags & VM_PFNMAP);
1774 vma->vm_flags |= VM_MIXEDMAP;
1775 }
1776 return insert_page(vma, addr, page, vma->vm_page_prot);
1777}
1778EXPORT_SYMBOL(vm_insert_page);
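/*
 * Sketch of a typical caller (names are illustrative only): an mmap
 * handler inserting one kernel-allocated page into the user vma.
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct page *page = mydrv_buffer_page(file);
 *
 *		return vm_insert_page(vma, vma->vm_start, page);
 *	}
 *
 * vm_insert_page() sets VM_MIXEDMAP on the vma the first time it is used,
 * so callers other than ->mmap handlers must set that flag themselves.
 */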
1779
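/*
 * Lowest-level pfn insertion: take the pte lock, build a special (or
 * devmap) pte for @pfn and install it.  With @mkwrite an already-present
 * pte for the same pfn may also be upgraded to writable.
 */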
1780static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1781 pfn_t pfn, pgprot_t prot, bool mkwrite)
1782{
1783 struct mm_struct *mm = vma->vm_mm;
1784 int retval;
1785 pte_t *pte, entry;
1786 spinlock_t *ptl;
1787
1788 retval = -ENOMEM;
1789 pte = get_locked_pte(mm, addr, &ptl);
1790 if (!pte)
1791 goto out;
1792 retval = -EBUSY;
1793 if (!pte_none(*pte)) {
1794 if (mkwrite) {
                        /*
                         * For read faults on private mappings the PFN passed
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
                         * mapping and we expect the PFNs to match.
                         */
1802 if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
1803 goto out_unlock;
1804 entry = *pte;
1805 goto out_mkwrite;
1806 } else
1807 goto out_unlock;
1808 }
1809
1810
1811 if (pfn_t_devmap(pfn))
1812 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1813 else
1814 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1815
1816out_mkwrite:
1817 if (mkwrite) {
1818 entry = pte_mkyoung(entry);
1819 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1820 }
1821
1822 set_pte_at(mm, addr, pte, entry);
1823 update_mmu_cache(vma, addr, pte);
1824
1825 retval = 0;
1826out_unlock:
1827 pte_unmap_unlock(pte, ptl);
1828out:
1829 return retval;
1830}
1831
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual
 * pages they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
1849int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1850 unsigned long pfn)
1851{
1852 return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1853}
1854EXPORT_SYMBOL(vm_insert_pfn);
1855
/**
 * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vm_insert_pfn, except that it allows drivers to
 * override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * cow mappings.  In general, using multiple vmas is preferable;
 * vm_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 */
1871int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1872 unsigned long pfn, pgprot_t pgprot)
1873{
1874 int ret;
1875
        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
         * consistency in testing and feature parity among all, so we should
         * try to keep these invariants in place for everybody.
         */
1881 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1882 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1883 (VM_PFNMAP|VM_MIXEDMAP));
1884 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1885 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1886
1887 if (addr < vma->vm_start || addr >= vma->vm_end)
1888 return -EFAULT;
1889
1890 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1891
1892 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1893 false);
1894
1895 return ret;
1896}
1897EXPORT_SYMBOL(vm_insert_pfn_prot);
1898
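/*
 * Common helper for vm_insert_mixed{,_mkwrite}(): insert either a
 * refcounted struct page (when the architecture has no pte_special and
 * the pfn is backed by one) or a raw special/devmap pte.
 */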
1899static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1900 pfn_t pfn, bool mkwrite)
1901{
1902 pgprot_t pgprot = vma->vm_page_prot;
1903
1904 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1905
1906 if (addr < vma->vm_start || addr >= vma->vm_end)
1907 return -EFAULT;
1908
1909 track_pfn_insert(vma, &pgprot, pfn);
1910
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
         * refcount the page if pfn_valid is true (hence insert_page rather
         * than insert_pfn).
         */
1918 if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1919 struct page *page;
1920
                /*
                 * At this point we are committed to insert_page()
                 * regardless of whether the caller specified flags that
                 * result in pfn_t_has_page() == false.
                 */
1926 page = pfn_to_page(pfn_t_to_pfn(pfn));
1927 return insert_page(vma, addr, page, pgprot);
1928 }
1929 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1930}
1931
1932int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1933 pfn_t pfn)
1934{
1935 return __vm_insert_mixed(vma, addr, pfn, false);
1936
1937}
1938EXPORT_SYMBOL(vm_insert_mixed);
1939
1940int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
1941 pfn_t pfn)
1942{
1943 return __vm_insert_mixed(vma, addr, pfn, true);
1944}
1945EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
1946
/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
1952static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1953 unsigned long addr, unsigned long end,
1954 unsigned long pfn, pgprot_t prot)
1955{
1956 pte_t *pte;
1957 spinlock_t *ptl;
1958
1959 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1960 if (!pte)
1961 return -ENOMEM;
1962 arch_enter_lazy_mmu_mode();
1963 do {
1964 BUG_ON(!pte_none(*pte));
1965 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1966 pfn++;
1967 } while (pte++, addr += PAGE_SIZE, addr != end);
1968 arch_leave_lazy_mmu_mode();
1969 pte_unmap_unlock(pte - 1, ptl);
1970 return 0;
1971}
1972
1973static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1974 unsigned long addr, unsigned long end,
1975 unsigned long pfn, pgprot_t prot)
1976{
1977 pmd_t *pmd;
1978 unsigned long next;
1979
1980 pfn -= addr >> PAGE_SHIFT;
1981 pmd = pmd_alloc(mm, pud, addr);
1982 if (!pmd)
1983 return -ENOMEM;
1984 VM_BUG_ON(pmd_trans_huge(*pmd));
1985 do {
1986 next = pmd_addr_end(addr, end);
1987 if (remap_pte_range(mm, pmd, addr, next,
1988 pfn + (addr >> PAGE_SHIFT), prot))
1989 return -ENOMEM;
1990 } while (pmd++, addr = next, addr != end);
1991 return 0;
1992}
1993
1994static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
1995 unsigned long addr, unsigned long end,
1996 unsigned long pfn, pgprot_t prot)
1997{
1998 pud_t *pud;
1999 unsigned long next;
2000
2001 pfn -= addr >> PAGE_SHIFT;
2002 pud = pud_alloc(mm, p4d, addr);
2003 if (!pud)
2004 return -ENOMEM;
2005 do {
2006 next = pud_addr_end(addr, end);
2007 if (remap_pmd_range(mm, pud, addr, next,
2008 pfn + (addr >> PAGE_SHIFT), prot))
2009 return -ENOMEM;
2010 } while (pud++, addr = next, addr != end);
2011 return 0;
2012}
2013
2014static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2015 unsigned long addr, unsigned long end,
2016 unsigned long pfn, pgprot_t prot)
2017{
2018 p4d_t *p4d;
2019 unsigned long next;
2020
2021 pfn -= addr >> PAGE_SHIFT;
2022 p4d = p4d_alloc(mm, pgd, addr);
2023 if (!p4d)
2024 return -ENOMEM;
2025 do {
2026 next = p4d_addr_end(addr, end);
2027 if (remap_pud_range(mm, p4d, addr, next,
2028 pfn + (addr >> PAGE_SHIFT), prot))
2029 return -ENOMEM;
2030 } while (p4d++, addr = next, addr != end);
2031 return 0;
2032}
2033
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 *  Note: this is only safe if the mm semaphore is held when called.
 */
2044int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2045 unsigned long pfn, unsigned long size, pgprot_t prot)
2046{
2047 pgd_t *pgd;
2048 unsigned long next;
2049 unsigned long end = addr + PAGE_ALIGN(size);
2050 struct mm_struct *mm = vma->vm_mm;
2051 unsigned long remap_pfn = pfn;
2052 int err;
2053
        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *	(accesses can have side effects).
         *   VM_PFNMAP tells the core MM that the base pages are just
         *	raw PFN mappings, and do not have a "struct page" associated
         *	with them.
         *   VM_DONTEXPAND
         *	Disable vma merging and expanding with mremap().
         *   VM_DONTDUMP
         *	Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         * See vm_normal_page() for details.
         */
2072 if (is_cow_mapping(vma->vm_flags)) {
2073 if (addr != vma->vm_start || end != vma->vm_end)
2074 return -EINVAL;
2075 vma->vm_pgoff = pfn;
2076 }
2077
2078 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
2079 if (err)
2080 return -EINVAL;
2081
2082 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2083
2084 BUG_ON(addr >= end);
2085 pfn -= addr >> PAGE_SHIFT;
2086 pgd = pgd_offset(mm, addr);
2087 flush_cache_range(vma, addr, end);
2088 do {
2089 next = pgd_addr_end(addr, end);
2090 err = remap_p4d_range(mm, pgd, addr, next,
2091 pfn + (addr >> PAGE_SHIFT), prot);
2092 if (err)
2093 break;
2094 } while (pgd++, addr = next, addr != end);
2095
2096 if (err)
2097 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
2098
2099 return err;
2100}
2101EXPORT_SYMBOL(remap_pfn_range);
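/*
 * Sketch of a typical caller (illustrative names): a character-device
 * mmap handler exposing a physical region to user space.
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long pfn = mydrv_phys_base(file) >> PAGE_SHIFT;
 *
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 *
 * For iomem it is usually simpler to go through vm_iomap_memory() below,
 * which does the pgoff/size sanity checks for you.
 */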
2102
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of area
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 */
2116int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2117{
2118 unsigned long vm_len, pfn, pages;
2119
        /* Check that the physical memory area passed in looks valid */
2121 if (start + len < start)
2122 return -EINVAL;
2123
        /*
         * You *really* shouldn't map things that aren't page-aligned,
         * but we've historically allowed it because IO memory might
         * just have smaller alignment.
         */
2128 len += start & ~PAGE_MASK;
2129 pfn = start >> PAGE_SHIFT;
2130 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2131 if (pfn + pages < pfn)
2132 return -EINVAL;
2133
        /* We start the mapping 'vm_pgoff' pages into the area */
2135 if (vma->vm_pgoff > pages)
2136 return -EINVAL;
2137 pfn += vma->vm_pgoff;
2138 pages -= vma->vm_pgoff;
2139
        /* Can we fit all of the mapping? */
2141 vm_len = vma->vm_end - vma->vm_start;
2142 if (vm_len >> PAGE_SHIFT > pages)
2143 return -EINVAL;
2144
        /* Ok, let it rip */
2146 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2147}
2148EXPORT_SYMBOL(vm_iomap_memory);
2149
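/*
 * Leaf level of apply_to_page_range(): allocate the pte page if needed,
 * then call @fn on every pte in [addr, end) under the pte lock (kernel
 * mappings use init_mm's page_table_lock instead).
 */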
2150static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2151 unsigned long addr, unsigned long end,
2152 pte_fn_t fn, void *data)
2153{
2154 pte_t *pte;
2155 int err;
2156 pgtable_t token;
2157 spinlock_t *uninitialized_var(ptl);
2158
2159 pte = (mm == &init_mm) ?
2160 pte_alloc_kernel(pmd, addr) :
2161 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2162 if (!pte)
2163 return -ENOMEM;
2164
2165 BUG_ON(pmd_huge(*pmd));
2166
2167 arch_enter_lazy_mmu_mode();
2168
2169 token = pmd_pgtable(*pmd);
2170
2171 do {
2172 err = fn(pte++, token, addr, data);
2173 if (err)
2174 break;
2175 } while (addr += PAGE_SIZE, addr != end);
2176
2177 arch_leave_lazy_mmu_mode();
2178
2179 if (mm != &init_mm)
2180 pte_unmap_unlock(pte-1, ptl);
2181 return err;
2182}
2183
2184static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2185 unsigned long addr, unsigned long end,
2186 pte_fn_t fn, void *data)
2187{
2188 pmd_t *pmd;
2189 unsigned long next;
2190 int err;
2191
2192 BUG_ON(pud_huge(*pud));
2193
2194 pmd = pmd_alloc(mm, pud, addr);
2195 if (!pmd)
2196 return -ENOMEM;
2197 do {
2198 next = pmd_addr_end(addr, end);
2199 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2200 if (err)
2201 break;
2202 } while (pmd++, addr = next, addr != end);
2203 return err;
2204}
2205
2206static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2207 unsigned long addr, unsigned long end,
2208 pte_fn_t fn, void *data)
2209{
2210 pud_t *pud;
2211 unsigned long next;
2212 int err;
2213
2214 pud = pud_alloc(mm, p4d, addr);
2215 if (!pud)
2216 return -ENOMEM;
2217 do {
2218 next = pud_addr_end(addr, end);
2219 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2220 if (err)
2221 break;
2222 } while (pud++, addr = next, addr != end);
2223 return err;
2224}
2225
2226static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2227 unsigned long addr, unsigned long end,
2228 pte_fn_t fn, void *data)
2229{
2230 p4d_t *p4d;
2231 unsigned long next;
2232 int err;
2233
2234 p4d = p4d_alloc(mm, pgd, addr);
2235 if (!p4d)
2236 return -ENOMEM;
2237 do {
2238 next = p4d_addr_end(addr, end);
2239 err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2240 if (err)
2241 break;
2242 } while (p4d++, addr = next, addr != end);
2243 return err;
2244}
2245
/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
2250int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2251 unsigned long size, pte_fn_t fn, void *data)
2252{
2253 pgd_t *pgd;
2254 unsigned long next;
2255 unsigned long end = addr + size;
2256 int err;
2257
2258 if (WARN_ON(addr >= end))
2259 return -EINVAL;
2260
2261 pgd = pgd_offset(mm, addr);
2262 do {
2263 next = pgd_addr_end(addr, end);
2264 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2265 if (err)
2266 break;
2267 } while (pgd++, addr = next, addr != end);
2268
2269 return err;
2270}
2271EXPORT_SYMBOL_GPL(apply_to_page_range);
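/*
 * Sketch of a caller (illustrative names only): point a range of kernel
 * virtual addresses at a single backing page by supplying a pte_fn_t.
 *
 *	static int set_one_pte(pte_t *pte, pgtable_t token,
 *			       unsigned long addr, void *data)
 *	{
 *		struct page *page = data;
 *
 *		set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
 *		return 0;
 *	}
 *
 *	err = apply_to_page_range(&init_mm, start, size, set_one_pte, page);
 */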
2272
/*
 * handle_pte_fault chooses page fault handler according to an entry which
 * was read non-atomically.  Before making any commitment, on those
 * architectures or configurations (e.g. i386 with PAE) which might give a
 * mix of unmatched parts, do_swap_page must check under lock before
 * unmapping the pte and proceeding (but do_wp_page is only called after
 * already making such a check; and do_anonymous_page can safely check
 * later on).
 */
2281static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2282 pte_t *page_table, pte_t orig_pte)
2283{
2284 int same = 1;
2285#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2286 if (sizeof(pte_t) > sizeof(unsigned long)) {
2287 spinlock_t *ptl = pte_lockptr(mm, pmd);
2288 spin_lock(ptl);
2289 same = pte_same(*page_table, orig_pte);
2290 spin_unlock(ptl);
2291 }
2292#endif
2293 pte_unmap(page_table);
2294 return same;
2295}
2296
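/*
 * Fill @dst with the data the COW fault is copying, either from @src or,
 * when the source pte had no struct page behind it, straight from the
 * faulting user address.
 */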
2297static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2298{
2299 debug_dma_assert_idle(src);
2300
        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
2307 if (unlikely(!src)) {
2308 void *kaddr = kmap_atomic(dst);
2309 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2310
                /*
                 * This really shouldn't fail, because the page is there
                 * in the page tables. But it might just be unreadable,
                 * in which case we just give up and fill the result with
                 * zeroes.
                 */
2317 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2318 clear_page(kaddr);
2319 kunmap_atomic(kaddr);
2320 flush_dcache_page(dst);
2321 } else
2322 copy_user_highpage(dst, src, va, vma);
2323}
2324
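/*
 * GFP mask used for allocations on behalf of a fault in this vma: the
 * mapping's own mask (plus __GFP_FS and __GFP_IO, faults may block on
 * both) for file mappings, plain GFP_KERNEL otherwise.
 */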
2325static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2326{
2327 struct file *vm_file = vma->vm_file;
2328
2329 if (vm_file)
2330 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2331
        /*
         * Special mappings (e.g. VDSO) do not have any file so fake
         * a default GFP_KERNEL for them.
         */
2336 return GFP_KERNEL;
2337}
2338
/*
 * Notify the address space that the page is about to become writable so
 * that it can prohibit this or wait for the page to get into an
 * appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
2345static int do_page_mkwrite(struct vm_fault *vmf)
2346{
2347 int ret;
2348 struct page *page = vmf->page;
2349 unsigned int old_flags = vmf->flags;
2350
2351 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2352
2353 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2354
2355 vmf->flags = old_flags;
2356 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2357 return ret;
2358 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2359 lock_page(page);
2360 if (!page->mapping) {
2361 unlock_page(page);
2362 return 0;
2363 }
2364 ret |= VM_FAULT_LOCKED;
2365 } else
2366 VM_BUG_ON_PAGE(!PageLocked(page), page);
2367 return ret;
2368}
2369
/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
2375static void fault_dirty_shared_page(struct vm_area_struct *vma,
2376 struct page *page)
2377{
2378 struct address_space *mapping;
2379 bool dirtied;
2380 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2381
2382 dirtied = set_page_dirty(page);
2383 VM_BUG_ON_PAGE(PageAnon(page), page);
2384
        /*
         * Take a local copy of the address_space - page.mapping may be zeroed
         * by truncate after unlock_page().   The address_space itself remains
         * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
2390 mapping = page_rmapping(page);
2391 unlock_page(page);
2392
2393 if ((dirtied || page_mkwrite) && mapping) {
                /*
                 * Throttle page dirtying rate down to writeback speed.
                 *
                 * mapping may be NULL here because some device drivers do not
                 * set page.mapping but still dirty their pages
                 */
2398 balance_dirty_pages_ratelimited(mapping);
2399 }
2400
2401 if (!page_mkwrite)
2402 file_update_time(vma->vm_file);
2403}
2404
/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
2413static inline void wp_page_reuse(struct vm_fault *vmf)
2414 __releases(vmf->ptl)
2415{
2416 struct vm_area_struct *vma = vmf->vma;
2417 struct page *page = vmf->page;
2418 pte_t entry;
2419
        /*
         * Clear the pages cpupid information as the existing
         * information potentially belongs to a now completely
         * unrelated process.
         */
2424 if (page)
2425 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2426
2427 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2428 entry = pte_mkyoung(vmf->orig_pte);
2429 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2430 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2431 update_mmu_cache(vma, vmf->address, vmf->pte);
2432 pte_unmap_unlock(vmf->pte, vmf->ptl);
2433}
2434
/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_sem locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the contents of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
2451static int wp_page_copy(struct vm_fault *vmf)
2452{
2453 struct vm_area_struct *vma = vmf->vma;
2454 struct mm_struct *mm = vma->vm_mm;
2455 struct page *old_page = vmf->page;
2456 struct page *new_page = NULL;
2457 pte_t entry;
2458 int page_copied = 0;
2459 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2460 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2461 struct mem_cgroup *memcg;
2462
2463 if (unlikely(anon_vma_prepare(vma)))
2464 goto oom;
2465
2466 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2467 new_page = alloc_zeroed_user_highpage_movable(vma,
2468 vmf->address);
2469 if (!new_page)
2470 goto oom;
2471 } else {
2472 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2473 vmf->address);
2474 if (!new_page)
2475 goto oom;
2476 cow_user_page(new_page, old_page, vmf->address, vma);
2477 }
2478
2479 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2480 goto oom_free_new;
2481
2482 __SetPageUptodate(new_page);
2483
2484 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2485
        /*
         * Re-check the pte - we dropped the lock
         */
2489 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2490 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2491 if (old_page) {
2492 if (!PageAnon(old_page)) {
2493 dec_mm_counter_fast(mm,
2494 mm_counter_file(old_page));
2495 inc_mm_counter_fast(mm, MM_ANONPAGES);
2496 }
2497 } else {
2498 inc_mm_counter_fast(mm, MM_ANONPAGES);
2499 }
2500 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2501 entry = mk_pte(new_page, vma->vm_page_prot);
2502 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
2509 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2510 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2511 mem_cgroup_commit_charge(new_page, memcg, false, false);
2512 lru_cache_add_active_or_unevictable(new_page, vma);
2513
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
                 * new page to be mapped directly into the secondary page table.
                 */
2518 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2519 update_mmu_cache(vma, vmf->address, vmf->pte);
2520 if (old_page) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount of the old page.  Doing it
                         * earlier would let a concurrent rmap walk observe
                         * the old page as unmapped while the stale pte still
                         * pointed at it, and make wrong decisions based on
                         * that (e.g. in page_mkclean or try_to_unmap).
                         */
2543 page_remove_rmap(old_page, false);
2544 }
2545
                /* Free the old page.. */
2547 new_page = old_page;
2548 page_copied = 1;
2549 } else {
2550 mem_cgroup_cancel_charge(new_page, memcg, false);
2551 }
2552
2553 if (new_page)
2554 put_page(new_page);
2555
2556 pte_unmap_unlock(vmf->pte, vmf->ptl);
2557 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2558 if (old_page) {
                /*
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
2563 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2564 lock_page(old_page);
2565 if (PageMlocked(old_page))
2566 munlock_vma_page(old_page);
2567 unlock_page(old_page);
2568 }
2569 put_page(old_page);
2570 }
2571 return page_copied ? VM_FAULT_WRITE : 0;
2572oom_free_new:
2573 put_page(new_page);
2574oom:
2575 if (old_page)
2576 put_page(old_page);
2577 return VM_FAULT_OOM;
2578}
2579
/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in
 * a shared mapping due to the PTE being read-only once the mapped page is
 * prepared.  It handles locking of the PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %VM_FAULT_NOPAGE if the PTE changed while the lock was dropped,
 * 0 on success.
 */
2595int finish_mkwrite_fault(struct vm_fault *vmf)
2596{
2597 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2598 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2599 &vmf->ptl);
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
2604 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2605 pte_unmap_unlock(vmf->pte, vmf->ptl);
2606 return VM_FAULT_NOPAGE;
2607 }
2608 wp_page_reuse(vmf);
2609 return 0;
2610}
2611
/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
2616static int wp_pfn_shared(struct vm_fault *vmf)
2617{
2618 struct vm_area_struct *vma = vmf->vma;
2619
2620 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2621 int ret;
2622
2623 pte_unmap_unlock(vmf->pte, vmf->ptl);
2624 vmf->flags |= FAULT_FLAG_MKWRITE;
2625 ret = vma->vm_ops->pfn_mkwrite(vmf);
2626 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2627 return ret;
2628 return finish_mkwrite_fault(vmf);
2629 }
2630 wp_page_reuse(vmf);
2631 return VM_FAULT_WRITE;
2632}
2633
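/*
 * Write fault on a shared, dirty-accountable page: let the filesystem
 * prepare the page via ->page_mkwrite if it has one, make the pte
 * writable, then balance dirty pages against writeback.
 */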
2634static int wp_page_shared(struct vm_fault *vmf)
2635 __releases(vmf->ptl)
2636{
2637 struct vm_area_struct *vma = vmf->vma;
2638
2639 get_page(vmf->page);
2640
2641 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2642 int tmp;
2643
2644 pte_unmap_unlock(vmf->pte, vmf->ptl);
2645 tmp = do_page_mkwrite(vmf);
2646 if (unlikely(!tmp || (tmp &
2647 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2648 put_page(vmf->page);
2649 return tmp;
2650 }
2651 tmp = finish_mkwrite_fault(vmf);
2652 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2653 unlock_page(vmf->page);
2654 put_page(vmf->page);
2655 return tmp;
2656 }
2657 } else {
2658 wp_page_reuse(vmf);
2659 lock_page(vmf->page);
2660 }
2661 fault_dirty_shared_page(vma, vmf->page);
2662 put_page(vmf->page);
2663
2664 return VM_FAULT_WRITE;
2665}
2666
/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with pte unmapped and unlocked.
 */
2685static int do_wp_page(struct vm_fault *vmf)
2686 __releases(vmf->ptl)
2687{
2688 struct vm_area_struct *vma = vmf->vma;
2689
2690 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2691 if (!vmf->page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
2699 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2700 (VM_WRITE|VM_SHARED))
2701 return wp_pfn_shared(vmf);
2702
2703 pte_unmap_unlock(vmf->pte, vmf->ptl);
2704 return wp_page_copy(vmf);
2705 }
2706
        /*
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
2711 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2712 int total_map_swapcount;
2713 if (!trylock_page(vmf->page)) {
2714 get_page(vmf->page);
2715 pte_unmap_unlock(vmf->pte, vmf->ptl);
2716 lock_page(vmf->page);
2717 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2718 vmf->address, &vmf->ptl);
2719 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2720 unlock_page(vmf->page);
2721 pte_unmap_unlock(vmf->pte, vmf->ptl);
2722 put_page(vmf->page);
2723 return 0;
2724 }
2725 put_page(vmf->page);
2726 }
2727 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2728 if (total_map_swapcount == 1) {
			/*
			 * The page is all ours.  Move it to
			 * our anon_vma so the rmap code will
			 * not search our parent or siblings.
			 * Protected against the rmap code by
			 * the page lock.
			 */
2736 page_move_anon_rmap(vmf->page, vma);
2737 }
2738 unlock_page(vmf->page);
2739 wp_page_reuse(vmf);
2740 return VM_FAULT_WRITE;
2741 }
2742 unlock_page(vmf->page);
2743 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2744 (VM_WRITE|VM_SHARED))) {
2745 return wp_page_shared(vmf);
2746 }
2747
	/*
	 * Ok, we need to copy. Oh, well..
	 */
2751 get_page(vmf->page);
2752
2753 pte_unmap_unlock(vmf->pte, vmf->ptl);
2754 return wp_page_copy(vmf);
2755}
2756
2757static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2758 unsigned long start_addr, unsigned long end_addr,
2759 struct zap_details *details)
2760{
2761 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2762}
2763
2764static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2765 struct zap_details *details)
2766{
2767 struct vm_area_struct *vma;
2768 pgoff_t vba, vea, zba, zea;
2769
2770 vma_interval_tree_foreach(vma, root,
2771 details->first_index, details->last_index) {
2772
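		/*
		 * Clip the zap range (zba..zea, in file pages) to the part of
		 * the file this vma actually maps (vba..vea), then convert it
		 * back to virtual addresses within the vma.
		 */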
2773 vba = vma->vm_pgoff;
2774 vea = vba + vma_pages(vma) - 1;
2775 zba = details->first_index;
2776 if (zba < vba)
2777 zba = vba;
2778 zea = details->last_index;
2779 if (zea > vea)
2780 zea = vea;
2781
2782 unmap_mapping_range_vma(vma,
2783 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2784 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2785 details);
2786 }
2787}
2788
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the
 * underlying file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
2806void unmap_mapping_range(struct address_space *mapping,
2807 loff_t const holebegin, loff_t const holelen, int even_cows)
2808{
2809 struct zap_details details = { };
2810 pgoff_t hba = holebegin >> PAGE_SHIFT;
2811 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
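	/*
	 * Example (4K pages): holebegin = 8192, holelen = 5000 gives
	 * hba = 2 (first page of the hole, rounded down) and
	 * hlen = 2 (number of pages, rounded up).
	 */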
2812
	/* Check for overflow. */
2814 if (sizeof(holelen) > sizeof(hlen)) {
2815 long long holeend =
2816 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2817 if (holeend & ~(long long)ULONG_MAX)
2818 hlen = ULONG_MAX - hba + 1;
2819 }
2820
2821 details.check_mapping = even_cows ? NULL : mapping;
2822 details.first_index = hba;
2823 details.last_index = hba + hlen - 1;
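	/* A wrapped last_index means the hole extends to the end of the file. */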
2824 if (details.last_index < details.first_index)
2825 details.last_index = ULONG_MAX;
2826
2827 i_mmap_lock_write(mapping);
2828 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2829 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2830 i_mmap_unlock_write(mapping);
2831}
2832EXPORT_SYMBOL(unmap_mapping_range);
2833
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_sem locked or unlocked in the same cases
 * as does filemap_fault().
 */
2842int do_swap_page(struct vm_fault *vmf)
2843{
2844 struct vm_area_struct *vma = vmf->vma;
2845 struct page *page = NULL, *swapcache;
2846 struct mem_cgroup *memcg;
2847 struct vma_swap_readahead swap_ra;
2848 swp_entry_t entry;
2849 pte_t pte;
2850 int locked;
2851 int exclusive = 0;
2852 int ret = 0;
2853 bool vma_readahead = swap_use_vma_readahead();
2854
2855 if (vma_readahead)
2856 page = swap_readahead_detect(vmf, &swap_ra);
2857 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
2858 if (page)
2859 put_page(page);
2860 goto out;
2861 }
2862
2863 entry = pte_to_swp_entry(vmf->orig_pte);
2864 if (unlikely(non_swap_entry(entry))) {
2865 if (is_migration_entry(entry)) {
2866 migration_entry_wait(vma->vm_mm, vmf->pmd,
2867 vmf->address);
2868 } else if (is_device_private_entry(entry)) {
			/*
			 * For un-addressable device memory we call the pgmap
			 * fault handler callback. The callback must migrate
			 * the page back to some CPU-accessible page.
			 */
2874 ret = device_private_entry_fault(vma, vmf->address, entry,
2875 vmf->flags, vmf->pmd);
2876 } else if (is_hwpoison_entry(entry)) {
2877 ret = VM_FAULT_HWPOISON;
2878 } else {
2879 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2880 ret = VM_FAULT_SIGBUS;
2881 }
2882 goto out;
2883 }
2884 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2885 if (!page)
2886 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2887 vmf->address);
2888 if (!page) {
2889 if (vma_readahead)
2890 page = do_swap_page_readahead(entry,
2891 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2892 else
2893 page = swapin_readahead(entry,
2894 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2895 if (!page) {
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
2900 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2901 vmf->address, &vmf->ptl);
2902 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2903 ret = VM_FAULT_OOM;
2904 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2905 goto unlock;
2906 }
2907
		/* Had to read the page from swap area: Major fault */
2909 ret = VM_FAULT_MAJOR;
2910 count_vm_event(PGMAJFAULT);
2911 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2912 } else if (PageHWPoison(page)) {
		/*
		 * The page sitting in the swap cache has been poisoned by a
		 * memory failure: back out and report the poison.
		 */
2917 ret = VM_FAULT_HWPOISON;
2918 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2919 swapcache = page;
2920 goto out_release;
2921 }
2922
2923 swapcache = page;
2924 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2925
2926 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2927 if (!locked) {
2928 ret |= VM_FAULT_RETRY;
2929 goto out_release;
2930 }
2931
	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the page's swap has not changed.
	 */
2938 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2939 goto out_page;
2940
2941 page = ksm_might_need_to_copy(page, vma, vmf->address);
2942 if (unlikely(!page)) {
2943 ret = VM_FAULT_OOM;
2944 page = swapcache;
2945 goto out_page;
2946 }
2947
2948 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
2949 &memcg, false)) {
2950 ret = VM_FAULT_OOM;
2951 goto out_page;
2952 }
2953
	/*
	 * Back out if somebody else already faulted in this pte.
	 */
2957 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2958 &vmf->ptl);
2959 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2960 goto out_nomap;
2961
2962 if (unlikely(!PageUptodate(page))) {
2963 ret = VM_FAULT_SIGBUS;
2964 goto out_nomap;
2965 }
2966
	/*
	 * The swap entry is being turned back into a present anonymous
	 * mapping: account it against MM_ANONPAGES instead of MM_SWAPENTS
	 * and build the new pte.  If this is a write fault and we are the
	 * only user of the swap page, map it writable right away and skip
	 * the separate COW fault.
	 */
2977 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2978 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2979 pte = mk_pte(page, vma->vm_page_prot);
2980 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2981 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2982 vmf->flags &= ~FAULT_FLAG_WRITE;
2983 ret |= VM_FAULT_WRITE;
2984 exclusive = RMAP_EXCLUSIVE;
2985 }
2986 flush_icache_page(vma, page);
2987 if (pte_swp_soft_dirty(vmf->orig_pte))
2988 pte = pte_mksoft_dirty(pte);
2989 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2990 vmf->orig_pte = pte;
2991 if (page == swapcache) {
2992 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2993 mem_cgroup_commit_charge(page, memcg, true, false);
2994 activate_page(page);
2995 } else {
2996 page_add_new_anon_rmap(page, vma, vmf->address, false);
2997 mem_cgroup_commit_charge(page, memcg, false, false);
2998 lru_cache_add_active_or_unevictable(page, vma);
2999 }
3000
3001 swap_free(entry);
3002 if (mem_cgroup_swap_full(page) ||
3003 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3004 try_to_free_swap(page);
3005 unlock_page(page);
3006 if (page != swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count won't change under a
		 * parallel locked swapcache.
		 */
3015 unlock_page(swapcache);
3016 put_page(swapcache);
3017 }
3018
3019 if (vmf->flags & FAULT_FLAG_WRITE) {
3020 ret |= do_wp_page(vmf);
3021 if (ret & VM_FAULT_ERROR)
3022 ret &= VM_FAULT_ERROR;
3023 goto out;
3024 }
3025
	/* No need to invalidate - it was non-present before */
3027 update_mmu_cache(vma, vmf->address, vmf->pte);
3028unlock:
3029 pte_unmap_unlock(vmf->pte, vmf->ptl);
3030out:
3031 return ret;
3032out_nomap:
3033 mem_cgroup_cancel_charge(page, memcg, false);
3034 pte_unmap_unlock(vmf->pte, vmf->ptl);
3035out_page:
3036 unlock_page(page);
3037out_release:
3038 put_page(page);
3039 if (page != swapcache) {
3040 unlock_page(swapcache);
3041 put_page(swapcache);
3042 }
3043 return ret;
3044}
3045
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
3051static int do_anonymous_page(struct vm_fault *vmf)
3052{
3053 struct vm_area_struct *vma = vmf->vma;
3054 struct mem_cgroup *memcg;
3055 struct page *page;
3056 int ret = 0;
3057 pte_t entry;
3058
	/* File mapping without ->vm_ops ? */
3060 if (vma->vm_flags & VM_SHARED)
3061 return VM_FAULT_SIGBUS;
3062
	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have down_read(mmap_sem).
	 */
3073 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
3074 return VM_FAULT_OOM;
3075
	/* See the comment in pte_alloc_one_map() */
3077 if (unlikely(pmd_trans_unstable(vmf->pmd)))
3078 return 0;
3079
	/* Use the zero-page for reads */
3081 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3082 !mm_forbids_zeropage(vma->vm_mm)) {
3083 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3084 vma->vm_page_prot));
3085 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3086 vmf->address, &vmf->ptl);
3087 if (!pte_none(*vmf->pte))
3088 goto unlock;
3089 ret = check_stable_address_space(vma->vm_mm);
3090 if (ret)
3091 goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
3093 if (userfaultfd_missing(vma)) {
3094 pte_unmap_unlock(vmf->pte, vmf->ptl);
3095 return handle_userfault(vmf, VM_UFFD_MISSING);
3096 }
3097 goto setpte;
3098 }
3099
	/* Allocate our own private page. */
3101 if (unlikely(anon_vma_prepare(vma)))
3102 goto oom;
3103 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3104 if (!page)
3105 goto oom;
3106
3107 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
3108 goto oom_free_page;
3109
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
3115 __SetPageUptodate(page);
3116
3117 entry = mk_pte(page, vma->vm_page_prot);
3118 if (vma->vm_flags & VM_WRITE)
3119 entry = pte_mkwrite(pte_mkdirty(entry));
3120
3121 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3122 &vmf->ptl);
3123 if (!pte_none(*vmf->pte))
3124 goto release;
3125
3126 ret = check_stable_address_space(vma->vm_mm);
3127 if (ret)
3128 goto release;
3129
	/* Deliver the page fault to userland, check inside PT lock */
3131 if (userfaultfd_missing(vma)) {
3132 pte_unmap_unlock(vmf->pte, vmf->ptl);
3133 mem_cgroup_cancel_charge(page, memcg, false);
3134 put_page(page);
3135 return handle_userfault(vmf, VM_UFFD_MISSING);
3136 }
3137
3138 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3139 page_add_new_anon_rmap(page, vma, vmf->address, false);
3140 mem_cgroup_commit_charge(page, memcg, false, false);
3141 lru_cache_add_active_or_unevictable(page, vma);
3142setpte:
3143 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3144
	/* No need to invalidate - it was non-present before */
3146 update_mmu_cache(vma, vmf->address, vmf->pte);
3147unlock:
3148 pte_unmap_unlock(vmf->pte, vmf->ptl);
3149 return ret;
3150release:
3151 mem_cgroup_cancel_charge(page, memcg, false);
3152 put_page(page);
3153 goto unlock;
3154oom_free_page:
3155 put_page(page);
3156oom:
3157 return VM_FAULT_OOM;
3158}
3159
/*
 * The mmap_sem must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3165static int __do_fault(struct vm_fault *vmf)
3166{
3167 struct vm_area_struct *vma = vmf->vma;
3168 int ret;
3169
3170 ret = vma->vm_ops->fault(vmf);
3171 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3172 VM_FAULT_DONE_COW)))
3173 return ret;
3174
3175 if (unlikely(PageHWPoison(vmf->page))) {
3176 if (ret & VM_FAULT_LOCKED)
3177 unlock_page(vmf->page);
3178 put_page(vmf->page);
3179 vmf->page = NULL;
3180 return VM_FAULT_HWPOISON;
3181 }
3182
3183 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3184 lock_page(vmf->page);
3185 else
3186 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3187
3188 return ret;
3189}
3190
/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
 * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
 */
3197static int pmd_devmap_trans_unstable(pmd_t *pmd)
3198{
3199 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3200}
3201
3202static int pte_alloc_one_map(struct vm_fault *vmf)
3203{
3204 struct vm_area_struct *vma = vmf->vma;
3205
3206 if (!pmd_none(*vmf->pmd))
3207 goto map_pte;
3208 if (vmf->prealloc_pte) {
3209 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3210 if (unlikely(!pmd_none(*vmf->pmd))) {
3211 spin_unlock(vmf->ptl);
3212 goto map_pte;
3213 }
3214
3215 atomic_long_inc(&vma->vm_mm->nr_ptes);
3216 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3217 spin_unlock(vmf->ptl);
3218 vmf->prealloc_pte = NULL;
3219 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
3220 return VM_FAULT_OOM;
3221 }
3222map_pte:
	/*
	 * If a huge pmd materialized under us just retry later.  Use
	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
	 * running immediately after a huge pmd fault in a different thread of
	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
	 * All we have to ensure is that it is a regular pmd that we can walk
	 * with pte_offset_map() and we can do that through an atomic read in
	 * C, which is what pmd_trans_unstable() provides.
	 */
3234 if (pmd_devmap_trans_unstable(vmf->pmd))
3235 return VM_FAULT_NOPAGE;
3236
	/*
	 * At this point we know vmf->pmd points to a regular page table,
	 * so it is safe to take the pte lock and map the pte for
	 * vmf->address.
	 */
3246 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3247 &vmf->ptl);
3248 return 0;
3249}
3250
3251#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3252
3253#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3254static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3255 unsigned long haddr)
3256{
3257 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3258 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3259 return false;
3260 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3261 return false;
3262 return true;
3263}
3264
3265static void deposit_prealloc_pte(struct vm_fault *vmf)
3266{
3267 struct vm_area_struct *vma = vmf->vma;
3268
3269 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
3274 atomic_long_inc(&vma->vm_mm->nr_ptes);
3275 vmf->prealloc_pte = NULL;
3276}
3277
3278static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3279{
3280 struct vm_area_struct *vma = vmf->vma;
3281 bool write = vmf->flags & FAULT_FLAG_WRITE;
3282 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3283 pmd_t entry;
3284 int i, ret;
3285
3286 if (!transhuge_vma_suitable(vma, haddr))
3287 return VM_FAULT_FALLBACK;
3288
3289 ret = VM_FAULT_FALLBACK;
3290 page = compound_head(page);
	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry.  Use the preallocated table for that.
	 */
3296 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3297 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
3298 if (!vmf->prealloc_pte)
3299 return VM_FAULT_OOM;
3300 smp_wmb();
3301 }
3302
3303 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3304 if (unlikely(!pmd_none(*vmf->pmd)))
3305 goto out;
3306
3307 for (i = 0; i < HPAGE_PMD_NR; i++)
3308 flush_icache_page(vma, page + i);
3309
3310 entry = mk_huge_pmd(page, vma->vm_page_prot);
3311 if (write)
3312 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3313
3314 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
3315 page_add_file_rmap(page, true);
	/*
	 * deposit and withdraw with pmd lock held
	 */
3319 if (arch_needs_pgtable_deposit())
3320 deposit_prealloc_pte(vmf);
3321
3322 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3323
3324 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3325
3326
3327 ret = 0;
3328 count_vm_event(THP_FILE_MAPPED);
3329out:
3330 spin_unlock(vmf->ptl);
3331 return ret;
3332}
3333#else
3334static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3335{
3336 BUILD_BUG();
3337 return 0;
3338}
3339#endif
3340
/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates page table or uses one that was
 * pre-allocated.
 *
 * @vmf: fault environment
 * @memcg: memcg to charge page (only for private mappings)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
 */
3355int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3356 struct page *page)
3357{
3358 struct vm_area_struct *vma = vmf->vma;
3359 bool write = vmf->flags & FAULT_FLAG_WRITE;
3360 pte_t entry;
3361 int ret;
3362
3363 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3364 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
		/* THP on COW? */
3366 VM_BUG_ON_PAGE(memcg, page);
3367
3368 ret = do_set_pmd(vmf, page);
3369 if (ret != VM_FAULT_FALLBACK)
3370 return ret;
3371 }
3372
3373 if (!vmf->pte) {
3374 ret = pte_alloc_one_map(vmf);
3375 if (ret)
3376 return ret;
3377 }
3378
	/* Re-check under ptl */
3380 if (unlikely(!pte_none(*vmf->pte)))
3381 return VM_FAULT_NOPAGE;
3382
3383 flush_icache_page(vma, page);
3384 entry = mk_pte(page, vma->vm_page_prot);
3385 if (write)
3386 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
3388 if (write && !(vma->vm_flags & VM_SHARED)) {
3389 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3390 page_add_new_anon_rmap(page, vma, vmf->address, false);
3391 mem_cgroup_commit_charge(page, memcg, false, false);
3392 lru_cache_add_active_or_unevictable(page, vma);
3393 } else {
3394 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3395 page_add_file_rmap(page, false);
3396 }
3397 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3398
	/* no need to invalidate: a not-present page won't be cached */
3400 update_mmu_cache(vma, vmf->address, vmf->pte);
3401
3402 return 0;
3403}
3404
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, adds PTE to page
 * tables and sets up correct properties for the new PTE. It returns 0 on
 * success, VM_FAULT_ code in case of error.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 */
3420int finish_fault(struct vm_fault *vmf)
3421{
3422 struct page *page;
3423 int ret = 0;
3424
	/* Did we COW the page? */
3426 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3427 !(vmf->vma->vm_flags & VM_SHARED))
3428 page = vmf->cow_page;
3429 else
3430 page = vmf->page;
3431
	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
3436 if (!(vmf->vma->vm_flags & VM_SHARED))
3437 ret = check_stable_address_space(vmf->vma->vm_mm);
3438 if (!ret)
3439 ret = alloc_set_pte(vmf, vmf->memcg, page);
3440 if (vmf->pte)
3441 pte_unmap_unlock(vmf->pte, vmf->ptl);
3442 return ret;
3443}
3444
3445static unsigned long fault_around_bytes __read_mostly =
3446 rounddown_pow_of_two(65536);
3447
3448#ifdef CONFIG_DEBUG_FS
3449static int fault_around_bytes_get(void *data, u64 *val)
3450{
3451 *val = fault_around_bytes;
3452 return 0;
3453}
3454
/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.  Values larger than one page table's
 * worth of pages are rejected.
 */
3460static int fault_around_bytes_set(void *data, u64 val)
3461{
3462 if (val / PAGE_SIZE > PTRS_PER_PTE)
3463 return -EINVAL;
3464 if (val > PAGE_SIZE)
3465 fault_around_bytes = rounddown_pow_of_two(val);
3466 else
3467 fault_around_bytes = PAGE_SIZE;
3468 return 0;
3469}
3470DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3471 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
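/*
 * With debugfs mounted (typically at /sys/kernel/debug), the knob is exposed
 * there as fault_around_bytes.  Writing a value of PAGE_SIZE or less
 * effectively disables fault-around, since do_read_fault() only calls
 * do_fault_around() when more than one page would be mapped.
 */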
3472
3473static int __init fault_around_debugfs(void)
3474{
3475 void *ret;
3476
3477 ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3478 &fault_around_bytes_fops);
3479 if (!ret)
3480 pr_warn("Failed to create fault_around_bytes in debugfs");
3481 return 0;
3482}
3483late_initcall(fault_around_debugfs);
3484#endif
3485
/*
 * do_fault_around() tries to map a few pages around the fault address.  The
 * hope is that those pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips any page that is
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * The function doesn't cross VMA or page table boundaries, in order to call
 * map_pages() only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map; it is expected
 * to be a power of two covering no more than one page table's worth of pages,
 * and the start of the mapped area is naturally aligned to it (rounded down
 * to the machine page size).
 */
3509static int do_fault_around(struct vm_fault *vmf)
3510{
3511 unsigned long address = vmf->address, nr_pages, mask;
3512 pgoff_t start_pgoff = vmf->pgoff;
3513 pgoff_t end_pgoff;
3514 int off, ret = 0;
3515
3516 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3517 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
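	/*
	 * Example (4K pages, fault_around_bytes = 65536): nr_pages = 16 and
	 * mask clears the low 16 bits, so the start address below is the
	 * fault address rounded down to a 64KB boundary (clamped to the
	 * start of the VMA).
	 */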
3518
3519 vmf->address = max(address & mask, vmf->vma->vm_start);
3520 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3521 start_pgoff -= off;
3522
	/*
	 * end_pgoff is either the end of the page table, the end of the vma,
	 * or start_pgoff + nr_pages - 1, whichever is nearest.
	 */
3527 end_pgoff = start_pgoff -
3528 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3529 PTRS_PER_PTE - 1;
3530 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3531 start_pgoff + nr_pages - 1);
3532
3533 if (pmd_none(*vmf->pmd)) {
3534 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3535 vmf->address);
3536 if (!vmf->prealloc_pte)
3537 goto out;
3538 smp_wmb();
3539 }
3540
3541 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3542
	/* Huge page is mapped? Page fault is solved */
3544 if (pmd_trans_huge(*vmf->pmd)) {
3545 ret = VM_FAULT_NOPAGE;
3546 goto out;
3547 }
3548
	/* ->map_pages() hasn't done anything useful. Cold page cache? */
3550 if (!vmf->pte)
3551 goto out;
3552
	/*
	 * Advance vmf->pte from the start of the mapped range to the entry
	 * for the original fault address and check whether ->map_pages()
	 * populated it.
	 */
3554 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3555 if (!pte_none(*vmf->pte))
3556 ret = VM_FAULT_NOPAGE;
3557 pte_unmap_unlock(vmf->pte, vmf->ptl);
3558out:
3559 vmf->address = address;
3560 vmf->pte = NULL;
3561 return ret;
3562}
3563
3564static int do_read_fault(struct vm_fault *vmf)
3565{
3566 struct vm_area_struct *vma = vmf->vma;
3567 int ret = 0;
3568
	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
3574 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3575 ret = do_fault_around(vmf);
3576 if (ret)
3577 return ret;
3578 }
3579
3580 ret = __do_fault(vmf);
3581 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3582 return ret;
3583
3584 ret |= finish_fault(vmf);
3585 unlock_page(vmf->page);
3586 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3587 put_page(vmf->page);
3588 return ret;
3589}
3590
3591static int do_cow_fault(struct vm_fault *vmf)
3592{
3593 struct vm_area_struct *vma = vmf->vma;
3594 int ret;
3595
3596 if (unlikely(anon_vma_prepare(vma)))
3597 return VM_FAULT_OOM;
3598
3599 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3600 if (!vmf->cow_page)
3601 return VM_FAULT_OOM;
3602
3603 if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3604 &vmf->memcg, false)) {
3605 put_page(vmf->cow_page);
3606 return VM_FAULT_OOM;
3607 }
3608
3609 ret = __do_fault(vmf);
3610 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3611 goto uncharge_out;
3612 if (ret & VM_FAULT_DONE_COW)
3613 return ret;
3614
3615 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3616 __SetPageUptodate(vmf->cow_page);
3617
3618 ret |= finish_fault(vmf);
3619 unlock_page(vmf->page);
3620 put_page(vmf->page);
3621 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3622 goto uncharge_out;
3623 return ret;
3624uncharge_out:
3625 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3626 put_page(vmf->cow_page);
3627 return ret;
3628}
3629
3630static int do_shared_fault(struct vm_fault *vmf)
3631{
3632 struct vm_area_struct *vma = vmf->vma;
3633 int ret, tmp;
3634
3635 ret = __do_fault(vmf);
3636 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3637 return ret;
3638
	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
3643 if (vma->vm_ops->page_mkwrite) {
3644 unlock_page(vmf->page);
3645 tmp = do_page_mkwrite(vmf);
3646 if (unlikely(!tmp ||
3647 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3648 put_page(vmf->page);
3649 return tmp;
3650 }
3651 }
3652
3653 ret |= finish_fault(vmf);
3654 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3655 VM_FAULT_RETRY))) {
3656 unlock_page(vmf->page);
3657 put_page(vmf->page);
3658 return ret;
3659 }
3660
3661 fault_dirty_shared_page(vma, vmf->page);
3662 return ret;
3663}
3664
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3671static int do_fault(struct vm_fault *vmf)
3672{
3673 struct vm_area_struct *vma = vmf->vma;
3674 int ret;
3675
	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3677 if (!vma->vm_ops->fault)
3678 ret = VM_FAULT_SIGBUS;
3679 else if (!(vmf->flags & FAULT_FLAG_WRITE))
3680 ret = do_read_fault(vmf);
3681 else if (!(vma->vm_flags & VM_SHARED))
3682 ret = do_cow_fault(vmf);
3683 else
3684 ret = do_shared_fault(vmf);
3685
	/* preallocated pagetable is unused: free it */
3687 if (vmf->prealloc_pte) {
3688 pte_free(vma->vm_mm, vmf->prealloc_pte);
3689 vmf->prealloc_pte = NULL;
3690 }
3691 return ret;
3692}
3693
3694static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3695 unsigned long addr, int page_nid,
3696 int *flags)
3697{
3698 get_page(page);
3699
3700 count_vm_numa_event(NUMA_HINT_FAULTS);
3701 if (page_nid == numa_node_id()) {
3702 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3703 *flags |= TNF_FAULT_LOCAL;
3704 }
3705
3706 return mpol_misplaced(page, vma, addr);
3707}
3708
3709static int do_numa_page(struct vm_fault *vmf)
3710{
3711 struct vm_area_struct *vma = vmf->vma;
3712 struct page *page = NULL;
3713 int page_nid = -1;
3714 int last_cpupid;
3715 int target_nid;
3716 bool migrated = false;
3717 pte_t pte;
3718 bool was_writable = pte_savedwrite(vmf->orig_pte);
3719 int flags = 0;
3720
	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 */
3726 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3727 spin_lock(vmf->ptl);
3728 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3729 pte_unmap_unlock(vmf->pte, vmf->ptl);
3730 goto out;
3731 }
3732
	/*
	 * Make it present again.  Depending on how the arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
3737 pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
3738 pte = pte_modify(pte, vma->vm_page_prot);
3739 pte = pte_mkyoung(pte);
3740 if (was_writable)
3741 pte = pte_mkwrite(pte);
3742 ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
3743 update_mmu_cache(vma, vmf->address, vmf->pte);
3744
3745 page = vm_normal_page(vma, vmf->address, pte);
3746 if (!page) {
3747 pte_unmap_unlock(vmf->pte, vmf->ptl);
3748 return 0;
3749 }
3750
	/* TODO: handle PTE-mapped THP */
3752 if (PageCompound(page)) {
3753 pte_unmap_unlock(vmf->pte, vmf->ptl);
3754 return 0;
3755 }
3756
	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
3765 if (!pte_write(pte))
3766 flags |= TNF_NO_GROUP;
3767
	/*
	 * Flag if the page is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together
	 */
3772 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3773 flags |= TNF_SHARED;
3774
3775 last_cpupid = page_cpupid_last(page);
3776 page_nid = page_to_nid(page);
3777 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3778 &flags);
3779 pte_unmap_unlock(vmf->pte, vmf->ptl);
3780 if (target_nid == -1) {
3781 put_page(page);
3782 goto out;
3783 }
3784
	/* Migrate to the requested node */
3786 migrated = migrate_misplaced_page(page, vma, target_nid);
3787 if (migrated) {
3788 page_nid = target_nid;
3789 flags |= TNF_MIGRATED;
3790 } else
3791 flags |= TNF_MIGRATE_FAIL;
3792
3793out:
3794 if (page_nid != -1)
3795 task_numa_fault(last_cpupid, page_nid, 1, flags);
3796 return 0;
3797}
3798
3799static inline int create_huge_pmd(struct vm_fault *vmf)
3800{
3801 if (vma_is_anonymous(vmf->vma))
3802 return do_huge_pmd_anonymous_page(vmf);
3803 if (vmf->vma->vm_ops->huge_fault)
3804 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3805 return VM_FAULT_FALLBACK;
3806}
3807
3808static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3809{
3810 if (vma_is_anonymous(vmf->vma))
3811 return do_huge_pmd_wp_page(vmf, orig_pmd);
3812 if (vmf->vma->vm_ops->huge_fault)
3813 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3814
	/* COW handled on pte level: split pmd */
3816 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3817 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3818
3819 return VM_FAULT_FALLBACK;
3820}
3821
3822static inline bool vma_is_accessible(struct vm_area_struct *vma)
3823{
3824 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3825}
3826
3827static int create_huge_pud(struct vm_fault *vmf)
3828{
3829#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
3831 if (vma_is_anonymous(vmf->vma))
3832 return VM_FAULT_FALLBACK;
3833 if (vmf->vma->vm_ops->huge_fault)
3834 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3835#endif
3836 return VM_FAULT_FALLBACK;
3837}
3838
3839static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3840{
3841#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
3843 if (vma_is_anonymous(vmf->vma))
3844 return VM_FAULT_FALLBACK;
3845 if (vmf->vma->vm_ops->huge_fault)
3846 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3847#endif
3848 return VM_FAULT_FALLBACK;
3849}
3850
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
3866static int handle_pte_fault(struct vm_fault *vmf)
3867{
3868 pte_t entry;
3869
3870 if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
3877 vmf->pte = NULL;
3878 } else {
		/* See comment in pte_alloc_one_map() */
3880 if (pmd_devmap_trans_unstable(vmf->pmd))
3881 return 0;
		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd from under us anymore at this point because we hold the
		 * mmap_sem read mode and khugepaged takes it in write mode.
		 * So now it's safe to run pte_offset_map().
		 */
3888 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3889 vmf->orig_pte = *vmf->pte;
		/*
		 * some architectures can have larger ptes than wordsize,
		 * e.g. ppc44x-defconfig has CONFIG_PTE_64BIT=y and
		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
		 * accesses.  The code below just needs a consistent view
		 * for the ifs and we later double check anyway with the
		 * ptl lock held.  So here a barrier will do.
		 */
3899 barrier();
3900 if (pte_none(vmf->orig_pte)) {
3901 pte_unmap(vmf->pte);
3902 vmf->pte = NULL;
3903 }
3904 }
3905
3906 if (!vmf->pte) {
3907 if (vma_is_anonymous(vmf->vma))
3908 return do_anonymous_page(vmf);
3909 else
3910 return do_fault(vmf);
3911 }
3912
3913 if (!pte_present(vmf->orig_pte))
3914 return do_swap_page(vmf);
3915
3916 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3917 return do_numa_page(vmf);
3918
3919 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3920 spin_lock(vmf->ptl);
3921 entry = vmf->orig_pte;
3922 if (unlikely(!pte_same(*vmf->pte, entry)))
3923 goto unlock;
3924 if (vmf->flags & FAULT_FLAG_WRITE) {
3925 if (!pte_write(entry))
3926 return do_wp_page(vmf);
3927 entry = pte_mkdirty(entry);
3928 }
3929 entry = pte_mkyoung(entry);
3930 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3931 vmf->flags & FAULT_FLAG_WRITE)) {
3932 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3933 } else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
3940 if (vmf->flags & FAULT_FLAG_WRITE)
3941 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3942 }
3943unlock:
3944 pte_unmap_unlock(vmf->pte, vmf->ptl);
3945 return 0;
3946}
3947
/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
3954static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3955 unsigned int flags)
3956{
3957 struct vm_fault vmf = {
3958 .vma = vma,
3959 .address = address & PAGE_MASK,
3960 .flags = flags,
3961 .pgoff = linear_page_index(vma, address),
3962 .gfp_mask = __get_fault_gfp_mask(vma),
3963 };
3964 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3965 struct mm_struct *mm = vma->vm_mm;
3966 pgd_t *pgd;
3967 p4d_t *p4d;
3968 int ret;
3969
3970 pgd = pgd_offset(mm, address);
3971 p4d = p4d_alloc(mm, pgd, address);
3972 if (!p4d)
3973 return VM_FAULT_OOM;
3974
3975 vmf.pud = pud_alloc(mm, p4d, address);
3976 if (!vmf.pud)
3977 return VM_FAULT_OOM;
3978 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
3979 ret = create_huge_pud(&vmf);
3980 if (!(ret & VM_FAULT_FALLBACK))
3981 return ret;
3982 } else {
3983 pud_t orig_pud = *vmf.pud;
3984
3985 barrier();
3986 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

3990 if (dirty && !pud_write(orig_pud)) {
3991 ret = wp_huge_pud(&vmf, orig_pud);
3992 if (!(ret & VM_FAULT_FALLBACK))
3993 return ret;
3994 } else {
3995 huge_pud_set_accessed(&vmf, orig_pud);
3996 return 0;
3997 }
3998 }
3999 }
4000
4001 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4002 if (!vmf.pmd)
4003 return VM_FAULT_OOM;
4004 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
4005 ret = create_huge_pmd(&vmf);
4006 if (!(ret & VM_FAULT_FALLBACK))
4007 return ret;
4008 } else {
4009 pmd_t orig_pmd = *vmf.pmd;
4010
4011 barrier();
4012 if (unlikely(is_swap_pmd(orig_pmd))) {
4013 VM_BUG_ON(thp_migration_supported() &&
4014 !is_pmd_migration_entry(orig_pmd));
4015 if (is_pmd_migration_entry(orig_pmd))
4016 pmd_migration_entry_wait(mm, vmf.pmd);
4017 return 0;
4018 }
4019 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
4020 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
4021 return do_huge_pmd_numa_page(&vmf, orig_pmd);
4022
4023 if (dirty && !pmd_write(orig_pmd)) {
4024 ret = wp_huge_pmd(&vmf, orig_pmd);
4025 if (!(ret & VM_FAULT_FALLBACK))
4026 return ret;
4027 } else {
4028 huge_pmd_set_accessed(&vmf, orig_pmd);
4029 return 0;
4030 }
4031 }
4032 }
4033
4034 return handle_pte_fault(&vmf);
4035}
4036
/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
4043int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4044 unsigned int flags)
4045{
4046 int ret;
4047
4048 __set_current_state(TASK_RUNNING);
4049
4050 count_vm_event(PGFAULT);
4051 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4052
	/* do counter updates before entering really critical section. */
4054 check_sync_rss_stat(current);
4055
4056 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4057 flags & FAULT_FLAG_INSTRUCTION,
4058 flags & FAULT_FLAG_REMOTE))
4059 return VM_FAULT_SIGSEGV;
4060
	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * mode; kernel faults are handled more gracefully.
	 */
4065 if (flags & FAULT_FLAG_USER)
4066 mem_cgroup_oom_enable();
4067
4068 if (unlikely(is_vm_hugetlb_page(vma)))
4069 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4070 else
4071 ret = __handle_mm_fault(vma, address, flags);
4072
4073 if (flags & FAULT_FLAG_USER) {
4074 mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
4081 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4082 mem_cgroup_oom_synchronize(false);
4083 }
4084
4085 return ret;
4086}
4087EXPORT_SYMBOL_GPL(handle_mm_fault);
4088
4089#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
4094int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4095{
4096 p4d_t *new = p4d_alloc_one(mm, address);
4097 if (!new)
4098 return -ENOMEM;
4099
4100 smp_wmb();
4101
4102 spin_lock(&mm->page_table_lock);
4103 if (pgd_present(*pgd))
4104 p4d_free(mm, new);
4105 else
4106 pgd_populate(mm, pgd, new);
4107 spin_unlock(&mm->page_table_lock);
4108 return 0;
4109}
4110#endif
4111
4112#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
4117int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4118{
4119 pud_t *new = pud_alloc_one(mm, address);
4120 if (!new)
4121 return -ENOMEM;
4122
4123 smp_wmb();
4124
4125 spin_lock(&mm->page_table_lock);
4126#ifndef __ARCH_HAS_5LEVEL_HACK
4127 if (p4d_present(*p4d))
4128 pud_free(mm, new);
4129 else
4130 p4d_populate(mm, p4d, new);
4131#else
4132 if (pgd_present(*p4d))
4133 pud_free(mm, new);
4134 else
4135 pgd_populate(mm, p4d, new);
4136#endif
4137 spin_unlock(&mm->page_table_lock);
4138 return 0;
4139}
4140#endif
4141
4142#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
4147int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4148{
4149 spinlock_t *ptl;
4150 pmd_t *new = pmd_alloc_one(mm, address);
4151 if (!new)
4152 return -ENOMEM;
4153
4154 smp_wmb();
4155
4156 ptl = pud_lock(mm, pud);
4157#ifndef __ARCH_HAS_4LEVEL_HACK
4158 if (!pud_present(*pud)) {
4159 mm_inc_nr_pmds(mm);
4160 pud_populate(mm, pud, new);
4161 } else
4162 pmd_free(mm, new);
4163#else
4164 if (!pgd_present(*pud)) {
4165 mm_inc_nr_pmds(mm);
4166 pgd_populate(mm, pud, new);
4167 } else
4168 pmd_free(mm, new);
4169#endif
4170 spin_unlock(ptl);
4171 return 0;
4172}
4173#endif
4174
4175static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4176 unsigned long *start, unsigned long *end,
4177 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4178{
4179 pgd_t *pgd;
4180 p4d_t *p4d;
4181 pud_t *pud;
4182 pmd_t *pmd;
4183 pte_t *ptep;
4184
4185 pgd = pgd_offset(mm, address);
4186 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4187 goto out;
4188
4189 p4d = p4d_offset(pgd, address);
4190 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4191 goto out;
4192
4193 pud = pud_offset(p4d, address);
4194 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4195 goto out;
4196
4197 pmd = pmd_offset(pud, address);
4198 VM_BUG_ON(pmd_trans_huge(*pmd));
4199
4200 if (pmd_huge(*pmd)) {
4201 if (!pmdpp)
4202 goto out;
4203
4204 if (start && end) {
4205 *start = address & PMD_MASK;
4206 *end = *start + PMD_SIZE;
4207 mmu_notifier_invalidate_range_start(mm, *start, *end);
4208 }
4209 *ptlp = pmd_lock(mm, pmd);
4210 if (pmd_huge(*pmd)) {
4211 *pmdpp = pmd;
4212 return 0;
4213 }
4214 spin_unlock(*ptlp);
4215 if (start && end)
4216 mmu_notifier_invalidate_range_end(mm, *start, *end);
4217 }
4218
4219 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4220 goto out;
4221
4222 if (start && end) {
4223 *start = address & PAGE_MASK;
4224 *end = *start + PAGE_SIZE;
4225 mmu_notifier_invalidate_range_start(mm, *start, *end);
4226 }
4227 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4228 if (!pte_present(*ptep))
4229 goto unlock;
4230 *ptepp = ptep;
4231 return 0;
4232unlock:
4233 pte_unmap_unlock(ptep, *ptlp);
4234 if (start && end)
4235 mmu_notifier_invalidate_range_end(mm, *start, *end);
4236out:
4237 return -EINVAL;
4238}
4239
4240static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4241 pte_t **ptepp, spinlock_t **ptlp)
4242{
4243 int res;
4244
4245
4246 (void) __cond_lock(*ptlp,
4247 !(res = __follow_pte_pmd(mm, address, NULL, NULL,
4248 ptepp, NULL, ptlp)));
4249 return res;
4250}
4251
4252int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4253 unsigned long *start, unsigned long *end,
4254 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4255{
4256 int res;
4257
4258
4259 (void) __cond_lock(*ptlp,
4260 !(res = __follow_pte_pmd(mm, address, start, end,
4261 ptepp, pmdpp, ptlp)));
4262 return res;
4263}
4264EXPORT_SYMBOL(follow_pte_pmd);
4265
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
4276int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4277 unsigned long *pfn)
4278{
4279 int ret = -EINVAL;
4280 spinlock_t *ptl;
4281 pte_t *ptep;
4282
4283 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4284 return ret;
4285
4286 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4287 if (ret)
4288 return ret;
4289 *pfn = pte_pfn(*ptep);
4290 pte_unmap_unlock(ptep, ptl);
4291 return 0;
4292}
4293EXPORT_SYMBOL(follow_pfn);
4294
4295#ifdef CONFIG_HAVE_IOREMAP_PROT
4296int follow_phys(struct vm_area_struct *vma,
4297 unsigned long address, unsigned int flags,
4298 unsigned long *prot, resource_size_t *phys)
4299{
4300 int ret = -EINVAL;
4301 pte_t *ptep, pte;
4302 spinlock_t *ptl;
4303
4304 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4305 goto out;
4306
4307 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4308 goto out;
4309 pte = *ptep;
4310
4311 if ((flags & FOLL_WRITE) && !pte_write(pte))
4312 goto unlock;
4313
4314 *prot = pgprot_val(pte_pgprot(pte));
4315 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4316
4317 ret = 0;
4318unlock:
4319 pte_unmap_unlock(ptep, ptl);
4320out:
4321 return ret;
4322}
4323
4324int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4325 void *buf, int len, int write)
4326{
4327 resource_size_t phys_addr;
4328 unsigned long prot = 0;
4329 void __iomem *maddr;
4330 int offset = addr & (PAGE_SIZE-1);
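	/*
	 * follow_phys() below yields the page-aligned physical base of the
	 * mapping; offset locates addr within that page, and the ioremap
	 * covers enough pages for the whole len starting at that offset.
	 */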
4331
4332 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4333 return -EINVAL;
4334
4335 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4336 if (write)
4337 memcpy_toio(maddr + offset, buf, len);
4338 else
4339 memcpy_fromio(buf, maddr + offset, len);
4340 iounmap(maddr);
4341
4342 return len;
4343}
4344EXPORT_SYMBOL_GPL(generic_access_phys);
4345#endif
4346
/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
4351int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4352 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4353{
4354 struct vm_area_struct *vma;
4355 void *old_buf = buf;
4356 int write = gup_flags & FOLL_WRITE;
4357
4358 down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
4360 while (len) {
4361 int bytes, ret, offset;
4362 void *maddr;
4363 struct page *page = NULL;
4364
4365 ret = get_user_pages_remote(tsk, mm, addr, 1,
4366 gup_flags, &page, &vma, NULL);
4367 if (ret <= 0) {
4368#ifndef CONFIG_HAVE_IOREMAP_PROT
4369 break;
4370#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
4375 vma = find_vma(mm, addr);
4376 if (!vma || vma->vm_start > addr)
4377 break;
4378 if (vma->vm_ops && vma->vm_ops->access)
4379 ret = vma->vm_ops->access(vma, addr, buf,
4380 len, write);
4381 if (ret <= 0)
4382 break;
4383 bytes = ret;
4384#endif
4385 } else {
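			/*
			 * Got the page: copy through its kernel mapping, at
			 * most up to the end of this page.
			 */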
4386 bytes = len;
4387 offset = addr & (PAGE_SIZE-1);
4388 if (bytes > PAGE_SIZE-offset)
4389 bytes = PAGE_SIZE-offset;
4390
4391 maddr = kmap(page);
4392 if (write) {
4393 copy_to_user_page(vma, page, addr,
4394 maddr + offset, buf, bytes);
4395 set_page_dirty_lock(page);
4396 } else {
4397 copy_from_user_page(vma, page, addr,
4398 buf, maddr + offset, bytes);
4399 }
4400 kunmap(page);
4401 put_page(page);
4402 }
4403 len -= bytes;
4404 buf += bytes;
4405 addr += bytes;
4406 }
4407 up_read(&mm->mmap_sem);
4408
4409 return buf - old_buf;
4410}
4411
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
4422int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4423 void *buf, int len, unsigned int gup_flags)
4424{
4425 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4426}
4427
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
4433int access_process_vm(struct task_struct *tsk, unsigned long addr,
4434 void *buf, int len, unsigned int gup_flags)
4435{
4436 struct mm_struct *mm;
4437 int ret;
4438
4439 mm = get_task_mm(tsk);
4440 if (!mm)
4441 return 0;
4442
4443 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4444
4445 mmput(mm);
4446
4447 return ret;
4448}
4449EXPORT_SYMBOL_GPL(access_process_vm);
4450
/*
 * Print the name of a VMA.
 */
4454void print_vma_addr(char *prefix, unsigned long ip)
4455{
4456 struct mm_struct *mm = current->mm;
4457 struct vm_area_struct *vma;
4458
	/*
	 * Do not print if we are in atomic
	 * contexts (in exception stacks, etc.):
	 */
4463 if (preempt_count())
4464 return;
4465
4466 down_read(&mm->mmap_sem);
4467 vma = find_vma(mm, ip);
4468 if (vma && vma->vm_file) {
4469 struct file *f = vma->vm_file;
4470 char *buf = (char *)__get_free_page(GFP_KERNEL);
4471 if (buf) {
4472 char *p;
4473
4474 p = file_path(f, buf, PAGE_SIZE);
4475 if (IS_ERR(p))
4476 p = "?";
4477 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4478 vma->vm_start,
4479 vma->vm_end - vma->vm_start);
4480 free_page((unsigned long)buf);
4481 }
4482 }
4483 up_read(&mm->mmap_sem);
4484}
4485
4486#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4487void __might_fault(const char *file, int line)
4488{
	/*
	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
	 * holding the mmap_sem, this is safe because kernel memory doesn't
	 * get paged out, therefore we'll never actually fault, and the
	 * below annotations will generate warnings.
	 */
4495 if (uaccess_kernel())
4496 return;
4497 if (pagefault_disabled())
4498 return;
4499 __might_sleep(file, line, 0);
4500#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4501 if (current->mm)
		might_lock_read(&current->mm->mmap_sem);
4503#endif
4504}
4505EXPORT_SYMBOL(__might_fault);
4506#endif
4507
4508#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4509static void clear_gigantic_page(struct page *page,
4510 unsigned long addr,
4511 unsigned int pages_per_huge_page)
4512{
4513 int i;
4514 struct page *p = page;
4515
4516 might_sleep();
4517 for (i = 0; i < pages_per_huge_page;
4518 i++, p = mem_map_next(p, page, i)) {
4519 cond_resched();
4520 clear_user_highpage(p, addr + i * PAGE_SIZE);
4521 }
4522}
4523void clear_huge_page(struct page *page,
4524 unsigned long addr_hint, unsigned int pages_per_huge_page)
4525{
4526 int i, n, base, l;
4527 unsigned long addr = addr_hint &
4528 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
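	/*
	 * Example: with 4K base pages and a 2MB huge page
	 * (pages_per_huge_page = 512), addr is addr_hint rounded down to
	 * the 2MB boundary.
	 */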
4529
4530 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4531 clear_gigantic_page(page, addr, pages_per_huge_page);
4532 return;
4533 }
4534
	/* Process target subpage last to keep its cache lines hot */
4536 might_sleep();
4537 n = (addr_hint - addr) / PAGE_SIZE;
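	/*
	 * n is the index of the subpage that actually faulted; base and l
	 * pick the window of 2*l subpages around it that is cleared last,
	 * from the outside in, so the faulting subpage is cleared at the
	 * very end and its cache lines stay hot.
	 */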
4538 if (2 * n <= pages_per_huge_page) {
4539
4540 base = 0;
4541 l = n;
4542
4543 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4544 cond_resched();
4545 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4546 }
4547 } else {
4548
4549 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4550 l = pages_per_huge_page - n;
4551
4552 for (i = 0; i < base; i++) {
4553 cond_resched();
4554 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4555 }
4556 }
4557
	/*
	 * Clear the remaining subpages of the window from both ends,
	 * converging towards the target subpage.
	 */
4561 for (i = 0; i < l; i++) {
4562 int left_idx = base + i;
4563 int right_idx = base + 2 * l - 1 - i;
4564
4565 cond_resched();
4566 clear_user_highpage(page + left_idx,
4567 addr + left_idx * PAGE_SIZE);
4568 cond_resched();
4569 clear_user_highpage(page + right_idx,
4570 addr + right_idx * PAGE_SIZE);
4571 }
4572}
4573
4574static void copy_user_gigantic_page(struct page *dst, struct page *src,
4575 unsigned long addr,
4576 struct vm_area_struct *vma,
4577 unsigned int pages_per_huge_page)
4578{
4579 int i;
4580 struct page *dst_base = dst;
4581 struct page *src_base = src;
4582
4583 for (i = 0; i < pages_per_huge_page; ) {
4584 cond_resched();
4585 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4586
4587 i++;
4588 dst = mem_map_next(dst, dst_base, i);
4589 src = mem_map_next(src, src_base, i);
4590 }
4591}
4592
4593void copy_user_huge_page(struct page *dst, struct page *src,
4594 unsigned long addr, struct vm_area_struct *vma,
4595 unsigned int pages_per_huge_page)
4596{
4597 int i;
4598
4599 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4600 copy_user_gigantic_page(dst, src, addr, vma,
4601 pages_per_huge_page);
4602 return;
4603 }
4604
4605 might_sleep();
4606 for (i = 0; i < pages_per_huge_page; i++) {
4607 cond_resched();
4608 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4609 }
4610}
4611
4612long copy_huge_page_from_user(struct page *dst_page,
4613 const void __user *usr_src,
4614 unsigned int pages_per_huge_page,
4615 bool allow_pagefault)
4616{
4617 void *src = (void *)usr_src;
4618 void *page_kaddr;
4619 unsigned long i, rc = 0;
4620 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
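	/*
	 * Like copy_from_user(): the return value is the number of bytes
	 * that could not be copied, so 0 means complete success.
	 */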
4621
4622 for (i = 0; i < pages_per_huge_page; i++) {
4623 if (allow_pagefault)
4624 page_kaddr = kmap(dst_page + i);
4625 else
4626 page_kaddr = kmap_atomic(dst_page + i);
4627 rc = copy_from_user(page_kaddr,
4628 (const void __user *)(src + i * PAGE_SIZE),
4629 PAGE_SIZE);
4630 if (allow_pagefault)
4631 kunmap(dst_page + i);
4632 else
4633 kunmap_atomic(page_kaddr);
4634
4635 ret_val -= (PAGE_SIZE - rc);
4636 if (rc)
4637 break;
4638
4639 cond_resched();
4640 }
4641 return ret_val;
4642}
4643#endif
4644
4645#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4646
4647static struct kmem_cache *page_ptl_cachep;
4648
4649void __init ptlock_cache_init(void)
4650{
4651 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4652 SLAB_PANIC, NULL);
4653}
4654
4655bool ptlock_alloc(struct page *page)
4656{
4657 spinlock_t *ptl;
4658
4659 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4660 if (!ptl)
4661 return false;
4662 page->ptl = ptl;
4663 return true;
4664}
4665
4666void ptlock_free(struct page *page)
4667{
4668 kmem_cache_free(page_ptl_cachep, page->ptl);
4669}
4670#endif
4671