1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41#include <linux/kernel_stat.h>
42#include <linux/mm.h>
43#include <linux/sched/mm.h>
44#include <linux/sched/coredump.h>
45#include <linux/sched/numa_balancing.h>
46#include <linux/sched/task.h>
47#include <linux/hugetlb.h>
48#include <linux/mman.h>
49#include <linux/swap.h>
50#include <linux/highmem.h>
51#include <linux/pagemap.h>
52#include <linux/memremap.h>
53#include <linux/ksm.h>
54#include <linux/rmap.h>
55#include <linux/export.h>
56#include <linux/delayacct.h>
57#include <linux/init.h>
58#include <linux/pfn_t.h>
59#include <linux/writeback.h>
60#include <linux/memcontrol.h>
61#include <linux/mmu_notifier.h>
62#include <linux/swapops.h>
63#include <linux/elf.h>
64#include <linux/gfp.h>
65#include <linux/migrate.h>
66#include <linux/string.h>
67#include <linux/dma-debug.h>
68#include <linux/debugfs.h>
69#include <linux/userfaultfd_k.h>
70#include <linux/dax.h>
71#include <linux/oom.h>
72
73#include <asm/io.h>
74#include <asm/mmu_context.h>
75#include <asm/pgalloc.h>
76#include <linux/uaccess.h>
77#include <asm/tlb.h>
78#include <asm/tlbflush.h>
79#include <asm/pgtable.h>
80
81#include "internal.h"
82
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
95
96
97
98
99
100
101
102
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, then end
 * of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
105
106
107
108
109
110
111
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

/* Early param "norandmaps": disable all VA-space randomization. */
static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
125
/* pfn of the shared all-zeroes page, cached for is_zero_pfn() checks. */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init();
 * cache its pfn here once it exists.
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);
140
141
#if defined(SPLIT_RSS_COUNTING)

/* Fold current's cached per-counter RSS deltas back into @mm and reset them. */
void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

/*
 * Fast path: when @mm is current's own mm, accumulate the delta in the
 * task struct (no atomic on the shared counter); otherwise fall back to
 * the regular mm counter update.
 */
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once for every 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
188
189#ifdef HAVE_GENERIC_MMU_GATHER
190
191static bool tlb_next_batch(struct mmu_gather *tlb)
192{
193 struct mmu_gather_batch *batch;
194
195 batch = tlb->active;
196 if (batch->next) {
197 tlb->active = batch->next;
198 return true;
199 }
200
201 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
202 return false;
203
204 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
205 if (!batch)
206 return false;
207
208 tlb->batch_count++;
209 batch->next = NULL;
210 batch->nr = 0;
211 batch->max = MAX_GATHER_BATCH;
212
213 tlb->active->next = batch;
214 tlb->active = batch;
215
216 return true;
217}
218
/*
 * Initialize an (on-stack) mmu_gather structure for page-table tear-down
 * over [start, end) of @mm.
 */
void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr = 0;
	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	tlb->active = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
	tlb->page_size = 0;

	__tlb_reset_range(tlb);
}
240
/*
 * Flush the TLB for the currently gathered range (if any), notify MMU
 * notifier listeners, then reset the range for the next gather.
 */
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	/* Nothing has been gathered since the last flush. */
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}
253
254static void tlb_flush_mmu_free(struct mmu_gather *tlb)
255{
256 struct mmu_gather_batch *batch;
257
258 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
259 free_pages_and_swap_cache(batch->pages, batch->nr);
260 batch->nr = 0;
261 }
262 tlb->active = &tlb->local;
263}
264
/*
 * Flush TLB entries for the pending range, then free the gathered pages.
 * The TLB flush must happen before the pages are handed back.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}
270
271
272
273
274
/*
 * Called at the end of the shootdown operation to free up any resources
 * that were required. @force widens the flush range to [start, end) even
 * if nothing was gathered (used when concurrent PTE updates were seen).
 */
void arch_tlb_finish_mmu(struct mmu_gather *tlb,
		unsigned long start, unsigned long end, bool force)
{
	struct mmu_gather_batch *batch, *next;

	if (force)
		__tlb_adjust_range(tlb, start, end - start);

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	/* Release the dynamically allocated batch pages. */
	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}
294
295
296
297
298
299
300
301
/*
 * Queue @page (of @page_size) for freeing after the TLB flush.
 * Returns true if the caller should flush (no batch space left),
 * false otherwise.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);
	VM_WARN_ON(tlb->page_size != page_size);

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return false;
}
324
325#endif
326
327#ifdef CONFIG_HAVE_RCU_TABLE_FREE
328
329
330
331
332
/* IPI handler used only for its side effect of interrupting the CPU. */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
337
static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling: the broadcast IPI below waits for every CPU to
	 * leave its IRQ-disabled section before the table is freed.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
350
351static void tlb_remove_table_rcu(struct rcu_head *head)
352{
353 struct mmu_table_batch *batch;
354 int i;
355
356 batch = container_of(head, struct mmu_table_batch, rcu);
357
358 for (i = 0; i < batch->nr; i++)
359 __tlb_remove_table(batch->tables[i]);
360
361 free_page((unsigned long)batch);
362}
363
/* Hand the pending table batch to RCU for freeing after a grace period. */
void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
373
/*
 * Queue a page-table page for RCU-deferred freeing, batching tables to
 * amortize the grace-period cost.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there's less then two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			/* No memory for a batch: synchronize and free now. */
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
399
400#endif
401
402
403
404
405
406
407
408
409
410
411
412
413
/*
 * tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down. The mmu_gather structure is initialized for the given mm,
 *	and the mm is marked as having a pending TLB flush so concurrent PTE
 *	updaters can detect us (see tlb_finish_mmu()).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			unsigned long start, unsigned long end)
{
	arch_tlb_gather_mmu(tlb, mm, start, end);
	inc_tlb_flush_pending(tlb->mm);
}
420
void tlb_finish_mmu(struct mmu_gather *tlb,
		unsigned long start, unsigned long end)
{
	/*
	 * If there are parallel threads doing PTE changes on the same range
	 * under a non-exclusive lock (e.g. mmap_sem read-side) but deferring
	 * the TLB flush by batching, a thread with a stale TLB entry can fail
	 * to flush by observing pte_none|!pte_dirty — so flush forcefully if
	 * we detect parallel PTE-batching threads.
	 */
	bool force = mm_tlb_flush_nested(tlb->mm);

	arch_tlb_finish_mmu(tlb, start, end, force);
	dec_tlb_flush_pending(tlb->mm);
}
436
437
438
439
440
/*
 * Free the pte page mapped by @pmd.
 *
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}
449
/*
 * Free the pte pages under [addr, end), then free the pmd page itself
 * if the entire PUD-aligned span it covers lies within [floor, ceiling).
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	/*
	 * Only free the pmd page if no neighbouring range outside
	 * [floor, ceiling) can still be using it.
	 */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}
483
/*
 * Free the pmd levels under [addr, end), then free the pud page itself
 * if the entire P4D-aligned span it covers lies within [floor, ceiling).
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	/* Same keep-it-if-a-neighbour-needs-it test as free_pmd_range(). */
	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}
517
/*
 * Free the pud levels under [addr, end), then free the p4d page itself
 * if the entire PGD-aligned span it covers lies within [floor, ceiling).
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	/* Same keep-it-if-a-neighbour-needs-it test as free_pmd_range(). */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}
550
551
552
553
/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Why are we testing PMD_MASK at this top level?  Because often there
	 * will be no work to do at all, and we'd prefer not to go all the way
	 * down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom of the
	 * address space and the top of it (using -1 for the top wouldn't help
	 * much: the masks would do the wrong thing).  The rule is that addr 0
	 * and floor 0 refer to the bottom of the address space, but end 0 and
	 * ceiling 0 refer to the top.  Comparisons therefore need to use
	 * "end - 1" and "ceiling - 1" (though the end 0 case should be
	 * mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must be
	 * careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.
	 *
	 * We round start (addr) and ceiling down, by different masks at
	 * different levels, in order to test whether a table now has no
	 * other users, so it can be freed; we don't bother to round floor
	 * or end up — the tests don't need that.
	 */
	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
615
/*
 * Walk the vma list, unlink each vma from its rmap structures, and free
 * the page tables spanned by the vmas, clamped to [floor, ceiling).
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before
		 * freeing pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next ? next->vm_start : ceiling);
		}
		vma = next;
	}
}
650
/*
 * Allocate a new pte page table and install it into @pmd if @pmd is still
 * empty. Returns 0 on success (including the benign race where another
 * thread populated the pmd first) or -ENOMEM.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (e.g. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order.
	 */
	smp_wmb(); /* publish the cleared pte page before pmd_populate() */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another thread populated it? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}
684
/* As __pte_alloc(), but for the kernel page tables of init_mm. */
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another thread populated it? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}
703
704static inline void init_rss_vec(int *rss)
705{
706 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
707}
708
709static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
710{
711 int i;
712
713 if (current->mm == mm)
714 sync_mm_rss(mm);
715 for (i = 0; i < NR_MM_COUNTERS; i++)
716 if (rss[i])
717 add_mm_counter(mm, i, rss[i]);
718}
719
720
721
722
723
724
725
726
/*
 * This function is called to print an error when a bad pte is found.
 * For example, we might have a PFN-mapped pte in a region that doesn't
 * allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
/*
 * _vm_normal_page -- get the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page"
 * (either it doesn't exist, or it exists but they don't want to touch it).
 * In this case, NULL is returned here. "Normal" mappings do have a
 * struct page.
 */
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte, bool with_public_device)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;

		/*
		 * Device public pages are special pages (they are ZONE_DEVICE
		 * pages but different from persistent memory). They behave
		 * almost like normal pages. The difference is that they are
		 * not on the LRU. Returning a struct page here is OK as long
		 * as the caller is aware it is a ZONE_DEVICE page.
		 */
		if (likely(pfn <= highest_memmap_pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (is_device_public_page(page)) {
				if (with_public_device)
					return page;
				return NULL;
			}
		}
		/* Special pte with no handler above: report it. */
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: infer specialness from vm_flags. */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
897
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * vm_normal_page_pmd -- pmd-level analogue of _vm_normal_page().
 * There is no pmd_special() yet, so specialness is inferred from the
 * vma flags only.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif
937
938
939
940
941
942
943
/*
 * Copy one pte from parent (src) to child (dst) during fork.
 * Returns 0 on success, or the swap entry value when swap_duplicate()
 * needs a count continuation (caller retries after allocating one).
 */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			rss[mm_counter(page)]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		} else if (is_device_private_entry(entry)) {
			page = device_private_entry_to_page(entry);

			/*
			 * Update rss count even for unaddressable pages, as
			 * they should be treated just like normal pages in
			 * this respect.
			 */
			get_page(page);
			rss[mm_counter(page)]++;
			page_dup_rmap(page, false);

			/*
			 * We do not preserve soft-dirty information here,
			 * because checkpoint/restore (the only user of it)
			 * does not work when a device driver is involved.
			 */
			if (is_write_device_private_entry(entry) &&
			    is_cow_mapping(vm_flags)) {
				make_device_private_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	} else if (pte_devmap(pte)) {
		page = pte_page(pte);

		/*
		 * Cache coherent device memory behaves like a regular page
		 * and not like a persistent memory page; take a reference
		 * and duplicate the rmap just as for normal pages.
		 */
		if (is_device_public_page(page)) {
			get_page(page);
			page_dup_rmap(page, false);
			rss[mm_counter(page)]++;
		}
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}
1061
/*
 * Copy one pte-page worth of mappings from src to dst during fork,
 * batching RSS updates and yielding the locks periodically.
 */
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		/* swap map count overflowed: attach a continuation page. */
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}
1124
/* Copy the pmd level of [addr, end), handling huge/swap/devmap pmds. */
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through: the huge pmd was split */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}
1158
/* Copy the pud level of [addr, end), handling huge/devmap puds. */
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
			int err;

			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
			err = copy_huge_pud(dst_mm, src_mm,
					    dst_pud, src_pud, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through: the huge pud was split */
		}
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}
1192
/* Copy the p4d level of [addr, end) from src to dst. */
static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	p4d_t *src_p4d, *dst_p4d;
	unsigned long next;

	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
	if (!dst_p4d)
		return -ENOMEM;
	src_p4d = p4d_offset(src_pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
}
1214
/*
 * Copy the page tables of @vma from @src_mm into @dst_mm (fork path).
 * Returns 0 on success or a negative errno.
 */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}
1281
/*
 * Unmap one pte-page worth of [addr, end), gathering pages for deferred
 * freeing in @tlb. @details (may be NULL) restricts which mappings are
 * zapped. Returns the address where the walk stopped (== end, or earlier
 * when a batch overflow forces a flush-and-retry).
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent))
			continue;

		if (pte_present(ptent)) {
			struct page *page;

			page = _vm_normal_page(vma, addr, ptent, true);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					/* dirty shared page: must flush before freeing */
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				/* batch full: flush and resume at next pte */
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}

		entry = pte_to_swp_entry(ptent);
		if (non_swap_entry(entry) && is_device_private_entry(entry)) {
			struct page *page = device_private_entry_to_page(entry);

			if (unlikely(details && details->check_mapping)) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping !=
				    page_rmapping(page))
					continue;
			}

			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			put_page(page);
			continue;
		}

		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}
1411
/* Unmap the pmd level of [addr, end), handling huge/swap/devmap pmds. */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				/* partial huge range: split first */
				VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
				    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}

		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}
1448
/* Unmap the pud level of [addr, end), handling huge/devmap puds. */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE) {
				/* partial huge range: split first */
				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				split_huge_pud(vma, pud, addr);
			} else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}
1477
/* Unmap the p4d level of [addr, end). */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
	} while (p4d++, addr = next, addr != end);

	return addr;
}
1496
/* Unmap [addr, end) of @vma, walking from the pgd level downwards. */
void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}
1516
1517
/*
 * Unmap the intersection of @vma with [start_addr, end_addr), handling
 * uprobes, PFN-map tracking, and hugetlb vmas specially.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Walks the vma list forward from @vma and unmaps each vma's overlap
 * with [start_addr, end_addr).  MMU notifiers are invalidated once
 * around the whole loop, not per vma.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
1590
1591
1592
1593
1594
1595
1596
1597
1598
/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the first applicable page
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Caller must protect the vma list.  Walks forward from @vma and
 * unmaps every vma overlapping [start, start + size).
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		unmap_single_vma(&tlb, vma, start, end, NULL);

		/*
		 * Flush the TLB for this vma eagerly here, in addition
		 * to the deferred flushing done by tlb_finish_mmu()
		 * below.  NOTE(review): upstream does not flush inside
		 * this loop — this looks like a backported/vendor
		 * change; confirm it is intentional.
		 */
		flush_tlb_range(vma, start, end);
	}

	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation (may be NULL)
 *
 * Like zap_page_range(), but operates on this single vma only.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1666 unsigned long size)
1667{
1668 if (address < vma->vm_start || address + size > vma->vm_end ||
1669 !(vma->vm_flags & VM_PFNMAP))
1670 return -1;
1671 zap_page_range_single(vma, address, size, NULL);
1672 return 0;
1673}
1674EXPORT_SYMBOL_GPL(zap_vma_ptes);
1675
/*
 * Allocate (if necessary) every page-table level down to the pte for
 * @addr and return the pte mapped and locked, with *@ptl set to the
 * lock taken.  Returns NULL if any level's allocation fails.
 * Caller must pte_unmap_unlock() the result.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	/* This path never deals with huge pmds. */
	VM_BUG_ON(pmd_trans_huge(*pmd));
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
1698
1699
1700
1701
1702
1703
1704
1705
/*
 * Map a kernel-managed page into userspace at @addr with protection
 * @prot.  Only file/!PageAnon pages are accepted (-EINVAL otherwise);
 * an already-populated pte yields -EBUSY and allocation failure
 * -ENOMEM.  On success the page gains a reference and a file rmap,
 * and the mm's file-page counter is bumped.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte;
	spinlock_t *ptl;

	retval = -EINVAL;
	if (PageAnon(page))
		goto out;
	retval = -ENOMEM;
	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	get_page(page);
	inc_mm_counter_fast(mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(mm, addr, pte, mk_pte(page, prot));

	retval = 0;
	pte_unmap_unlock(pte, ptl);
	return retval;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
/**
 * vm_insert_page - insert a single page into a user vma
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * Inserts an allocated page into a user vma.  The first call on a
 * vma marks it VM_MIXEDMAP; at that point the caller must hold
 * mmap_sem for writing (a successful read-trylock here would prove
 * it does not, hence the BUG_ON), and the vma must not already be
 * VM_PFNMAP.  The page must have a non-zero refcount.
 *
 * Returns 0 on success, or insert_page()'s error code.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
1782EXPORT_SYMBOL(vm_insert_page);
1783
/*
 * Install a pfn mapping (special or devmap pte, no struct page
 * accounting) at @addr.  With @mkwrite, an already-present pte for
 * the same pfn is upgraded to young/dirty/writable instead of
 * failing; a present pte for a *different* pfn is a caller bug
 * (WARN_ON_ONCE) and returns -EBUSY, as does any present pte when
 * !@mkwrite.  -ENOMEM if the page tables can't be allocated.
 */
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
			 * mkwrite is only meant to upgrade an existing
			 * mapping of the same pfn to writable; a racing
			 * insert of a different pfn is unexpected.
			 */
			if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
				goto out_unlock;
			entry = *pte;
			goto out_mkwrite;
		} else
			goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

out_mkwrite:
	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte);

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Convenience wrapper around vm_insert_pfn_prot() using the vma's
 * own vm_page_prot.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
1858EXPORT_SYMBOL(vm_insert_pfn);
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
/**
 * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * Like vm_insert_pfn() but with a caller-supplied pgprot.  The vma
 * must be exactly one of VM_PFNMAP or VM_MIXEDMAP (not both, not
 * neither), a VM_PFNMAP vma must not be a COW mapping, and a
 * VM_MIXEDMAP vma must not be handed a pfn_valid() pfn — all
 * enforced by BUG_ON.  Returns -EFAULT for an address outside the
 * vma, otherwise insert_pfn()'s result.
 */
int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	int ret;

	/*
	 * Sanity-check the vma's mapping type before installing a raw
	 * pfn: mixed usage of the two schemes on one vma is a bug.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);

	return ret;
}
1902
1903static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1904{
1905
1906 if (vma->vm_flags & VM_MIXEDMAP)
1907 return true;
1908 if (pfn_t_devmap(pfn))
1909 return true;
1910 if (pfn_t_special(pfn))
1911 return true;
1912 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
1913 return true;
1914 return false;
1915}
1916
/*
 * Common implementation for vm_insert_mixed{,_mkwrite}().  Installs
 * @pfn at @addr: without pte_special support, a non-devmap pfn that
 * has a struct page is inserted as an ordinary refcounted page via
 * insert_page() so vm_normal_page() can recognise it later; all
 * other cases go through insert_pfn().
 */
static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, bool mkwrite)
{
	pgprot_t pgprot = vma->vm_page_prot;

	BUG_ON(!vm_mixed_ok(vma, pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;

	track_pfn_insert(vma, &pgprot, pfn);

	/*
	 * Architectures without pte_special() cannot tell a raw-pfn
	 * pte apart from a normal one, so a valid, non-devmap pfn
	 * must be mapped as a real struct page with full rmap and
	 * refcount accounting.
	 */
	if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * pfn_t_valid() above guarantees this pfn has a
		 * struct page behind it.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		return insert_page(vma, addr, page, pgprot);
	}
	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
1949
1950int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1951 pfn_t pfn)
1952{
1953 return __vm_insert_mixed(vma, addr, pfn, false);
1954
1955}
1956EXPORT_SYMBOL(vm_insert_mixed);
1957
/*
 * Like vm_insert_mixed(), but may also upgrade an existing mapping
 * of the same pfn to a young, dirty, writable pte (mkwrite=true in
 * __vm_insert_mixed()/insert_pfn()).
 */
int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, true);
}
1963EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
1964
1965
1966
1967
1968
1969
/*
 * Bottom level of remap_pfn_range(): fill the pte range [addr, end)
 * with consecutive special ptes starting at @pfn.  Every target pte
 * must be none (BUG_ON otherwise — the caller maps only fresh
 * ranges).  Returns -ENOMEM if the pte page can't be allocated.
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}
1990
/*
 * pmd-level walker for remap_pfn_range(): allocate pmds over
 * [addr, end) and call remap_pte_range() for each.  @pfn is biased
 * by addr so that "pfn + (addr >> PAGE_SHIFT)" yields the pfn for
 * any address in the range.  Returns -ENOMEM on allocation failure.
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		if (remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
2011
/*
 * pud-level walker for remap_pfn_range(): allocate puds over
 * [addr, end) and recurse into remap_pmd_range().  Same pfn-biasing
 * scheme as remap_pmd_range().  Returns -ENOMEM on failure.
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}
2031
/*
 * p4d-level walker for remap_pfn_range(): allocate p4ds over
 * [addr, end) and recurse into remap_pud_range().  Same pfn-biasing
 * scheme as the levels below.  Returns -ENOMEM on failure.
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Map a contiguous physical range into a user vma as special ptes.
 * For COW mappings the whole vma must be covered (the vma's vm_pgoff
 * is repurposed to record the base pfn).  Marks the vma
 * VM_IO|VM_PFNMAP|VM_DONTEXPAND|VM_DONTDUMP and registers the range
 * with the arch PFN tracking (track_pfn_remap), untracking again on
 * failure.  Note: this function does NOT undo partially-installed
 * ptes on error; callers typically tear the vma down.
 *
 * Returns 0 on success, -EINVAL or -ENOMEM on failure.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long remap_pfn = pfn;
	int err;

	/*
	 * A COW mapping of a pfn range only makes sense if it covers
	 * the entire vma: vm_pgoff is overloaded to store the base
	 * pfn so vm_normal_page()/COW logic can identify the range
	 * later.  NOTE(review): full rationale lives in upstream's
	 * comment block here — see the mainline mm/memory.c.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (err)
		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));

	return err;
}
2119EXPORT_SYMBOL(remap_pfn_range);
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * Driver-friendly front end to io_remap_pfn_range(): takes a
 * physical start/len (need not be page aligned), converts to pages,
 * honours the vma's vm_pgoff as an offset into that area, and checks
 * that the vma fits inside it.  Overflow in either the byte range or
 * the pfn range yields -EINVAL.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Reject wrap-around of [start, start+len). */
	if (start + len < start)
		return -EINVAL;

	/*
	 * Round the area out to whole pages: absorb start's
	 * sub-page offset into len, then round len up.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* The vma's page offset is an offset into the physical area. */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* The vma must fit in what remains of the area. */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip: map the whole vma. */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
2166EXPORT_SYMBOL(vm_iomap_memory);
2167
/*
 * Bottom level of apply_to_page_range(): invoke @fn on every pte in
 * [addr, end).  For init_mm the kernel pte allocator is used and no
 * pte lock is taken; for user mms the pte page is mapped and locked.
 * Stops at the first non-zero return from @fn and propagates it.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pte_t *pte;
	int err;
	pgtable_t token;
	spinlock_t *uninitialized_var(ptl);	/* only set for user mms */

	pte = (mm == &init_mm) ?
		pte_alloc_kernel(pmd, addr) :
		pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;

	BUG_ON(pmd_huge(*pmd));

	arch_enter_lazy_mmu_mode();

	token = pmd_pgtable(*pmd);

	do {
		err = fn(pte++, token, addr, data);
		if (err)
			break;
	} while (addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(pte-1, ptl);
	return err;
}
2201
/*
 * pmd-level walker for apply_to_page_range(): allocate pmds over
 * [addr, end) and call apply_to_pte_range() for each, stopping on
 * the first error.
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	BUG_ON(pud_huge(*pud));

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);
	return err;
}
2223
/*
 * pud-level walker for apply_to_page_range(): allocate puds over
 * [addr, end) and recurse into apply_to_pmd_range(), stopping on
 * the first error.
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);
	return err;
}
2243
/*
 * p4d-level walker for apply_to_page_range(): allocate p4ds over
 * [addr, end) and recurse into apply_to_pud_range(), stopping on
 * the first error.
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);
	return err;
}
2263
2264
2265
2266
2267
/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table entry (allocating intermediate levels as
 * needed).  Returns the first non-zero value returned by @fn, or 0.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + size;
	int err;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
2289EXPORT_SYMBOL_GPL(apply_to_page_range);
2290
2291
2292
2293
2294
2295
2296
2297
2298
/*
 * Unmap @page_table and report whether the pte still equals
 * @orig_pte.  When pte_t is wider than a machine word (and we're on
 * SMP or preemptible), an unlocked read could see a torn value, so
 * the comparison is done under the pte lock; otherwise the snapshot
 * the caller already took is trusted and 1 is returned.
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}
2314
/*
 * Copy the faulting page's contents into @dst for COW.  @src may be
 * NULL (no struct page behind the mapping, e.g. a VM_PFNMAP COW);
 * in that case the data is copied directly from userspace at @va,
 * and if even that fails the new page is zero-filled rather than
 * propagating stale data.
 */
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
	debug_dma_assert_idle(src);

	/*
	 * NOTE(review): upstream's comment here explains that the
	 * user-space copy can legitimately fail (e.g. racing
	 * munmap), which is why the fallback below clears the page.
	 */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * Best-effort copy from the user virtual address; on
		 * failure hand back a zeroed page instead of garbage.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	} else
		copy_user_highpage(dst, src, va, vma);
}
2342
2343static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2344{
2345 struct file *vm_file = vma->vm_file;
2346
2347 if (vm_file)
2348 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2349
2350
2351
2352
2353
2354 return GFP_KERNEL;
2355}
2356
2357
2358
2359
2360
2361
2362
/*
 * Notify the filesystem that vmf->page is about to become writable
 * via its ->page_mkwrite() handler, temporarily presenting
 * WRITE|MKWRITE fault flags.  On success the page is guaranteed
 * locked (locking it here if the fs didn't), and VM_FAULT_LOCKED is
 * set in the return value; returns 0 if the page was truncated from
 * its mapping in the meantime.  Error/NOPAGE results from the fs are
 * passed straight through.
 */
static int do_page_mkwrite(struct vm_fault *vmf)
{
	int ret;
	struct page *page = vmf->page;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);

	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		lock_page(page);
		if (!page->mapping) {
			/* Page was truncated while we slept. */
			unlock_page(page);
			return 0;
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_PAGE(!PageLocked(page), page);
	return ret;
}
2387
2388
2389
2390
2391
2392
/*
 * Handle dirtying of a page as a result of a shared, writable fault:
 * mark the page dirty, drop the page lock, throttle against dirty
 * limits if anything was dirtied, and — when the fs has no
 * page_mkwrite of its own — update the file timestamps.  @page must
 * be locked on entry and is unlocked on return.
 */
static void fault_dirty_shared_page(struct vm_area_struct *vma,
				    struct page *page)
{
	struct address_space *mapping;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = set_page_dirty(page);
	VM_BUG_ON_PAGE(PageAnon(page), page);

	/*
	 * Snapshot the mapping before unlocking: once the lock is
	 * dropped the page may be truncated away from it.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if ((dirtied || page_mkwrite) && mapping) {
		/*
		 * Throttle the task if it is dirtying pages faster
		 * than writeback can clean them.
		 */
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
}
2422
2423
2424
2425
2426
2427
2428
2429
2430
/*
 * Handle a write fault by reusing the existing page in place: mark
 * the pte young/dirty/writable.  Called with the pte mapped and
 * locked (vmf->pte/vmf->ptl); unlocks it before returning.
 * vmf->page may be NULL for pfn mappings with no struct page.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = vmf->page;
	pte_t entry;

	/*
	 * Reset the page's cpupid so NUMA balancing does not treat
	 * this in-place write as a remote access.
	 */
	if (page)
		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
}
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
/*
 * Handle a write fault by copying the page (the COW slow path):
 * allocate a new anon page (zero-filled if the old pte was the zero
 * page), copy the old contents, then — under the pte lock, after
 * re-verifying the pte hasn't changed — swap the new page in,
 * updating rmap, memcg charge, LRU and mm counters.  Called with the
 * pte lock *released* by the caller; returns VM_FAULT_WRITE on
 * success, 0 if the pte changed under us, VM_FAULT_OOM on allocation
 * failure.
 */
static int wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	const unsigned long mmun_start = vmf->address & PAGE_MASK;
	const unsigned long mmun_end = mmun_start + PAGE_SIZE;
	struct mem_cgroup *memcg;

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, vmf->address, vma);
	}

	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
		goto oom_free_new;

	__SetPageUptodate(new_page);

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				/* File page becoming anon: move the
				 * rss accounting across. */
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		/*
		 * Clear the old pte (flushing TLBs and notifying
		 * secondary MMUs) before installing the new one, so
		 * no CPU can keep writing through a stale entry, and
		 * set up the new page's rmap/memcg/LRU state before
		 * the new pte makes it visible.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
		mem_cgroup_commit_charge(new_page, memcg, false, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		/*
		 * set_pte_at_notify() also tells MMU notifiers about
		 * the *new* mapping (used e.g. by KVM/uprobes-style
		 * change_pte consumers).
		 */
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Drop the old page's rmap now that the pte
			 * no longer points at it.  NOTE(review): the
			 * (elided upstream) comment here discusses
			 * why doing this after the pte switch is safe
			 * against racing rmap walks — see mainline
			 * mm/memory.c for the full argument.
			 */
			page_remove_rmap(old_page, false);
		}

		/* Free the new page.. which is now the old page. */
		new_page = old_page;
		page_copied = 1;
	} else {
		mem_cgroup_cancel_charge(new_page, memcg, false);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * "only_end": the secondary-MMU invalidate for the old pte
	 * already happened in ptep_clear_flush_notify() above.
	 */
	mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * Munlock the old page if it was mlocked: its mlock
		 * has effectively transferred to the new page.
		 */
		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
			lock_page(old_page);	/* LRU manipulation */
			if (PageMlocked(old_page))
				munlock_vma_page(old_page);
			unlock_page(old_page);
		}
		put_page(old_page);
	}
	return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);
	return VM_FAULT_OOM;
}
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
/*
 * Finish a shared-writable fault after the driver's ->pfn_mkwrite /
 * ->page_mkwrite callback ran with the pte lock dropped: re-take the
 * pte lock, and if the pte is unchanged, make it writable via
 * wp_page_reuse().  Returns VM_FAULT_NOPAGE if the pte changed in
 * the meantime (the fault must be retried), 0 otherwise.
 */
int finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we
	 * released the pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf);
	return 0;
}
2633
2634
2635
2636
2637
/*
 * Handle a write fault on a shared VM_PFNMAP-style mapping with no
 * struct page behind the pte.  If the driver has ->pfn_mkwrite, call
 * it (with the pte lock dropped and FAULT_FLAG_MKWRITE set) and
 * finish via finish_mkwrite_fault(); otherwise just make the pte
 * writable in place.  Called and returns per do_wp_page()'s
 * conventions, with vmf->ptl held on entry.
 */
static int wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		int ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}
2655
/*
 * Handle a write fault on a shared, file-backed page (vmf->page).
 * Runs the filesystem's ->page_mkwrite (if any) with the pte lock
 * dropped, re-validates via finish_mkwrite_fault(), then does the
 * shared-page dirty bookkeeping.  Holds a page reference across the
 * lock-drop window; vmf->ptl is held on entry and released on all
 * paths.
 */
static int wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		int tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	fault_dirty_shared_page(vma, vmf->page);
	put_page(vmf->page);

	return VM_FAULT_WRITE;
}
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
/*
 * Handle a write fault on a present, read-only pte (the COW /
 * write-protect fault dispatcher).  Three outcomes:
 *   - shared pfn/page mappings: make writable in place
 *     (wp_pfn_shared()/wp_page_shared());
 *   - exclusively-owned anon pages: reuse in place (wp_page_reuse());
 *   - everything else: copy the page (wp_page_copy()).
 * Entered with the pte mapped and locked (vmf->pte/vmf->ptl);
 * released on every path, either here or by the helper taken.
 */
static int do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		/*
		 * No struct page behind the pte (VM_PFNMAP /
		 * VM_MIXEDMAP raw-pfn mapping, or the zero page).
		 * Shared+writable ones get write access in place;
		 * the rest must be copied.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * An anonymous (non-KSM) page may be reusable in place if we
	 * are its only user; this needs the page lock.
	 */
	if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
		int total_map_swapcount;
		if (!trylock_page(vmf->page)) {
			/* Sleepable lock path: drop the pte lock,
			 * lock the page, then re-validate the pte. */
			get_page(vmf->page);
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			lock_page(vmf->page);
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (!pte_same(*vmf->pte, vmf->orig_pte)) {
				unlock_page(vmf->page);
				pte_unmap_unlock(vmf->pte, vmf->ptl);
				put_page(vmf->page);
				return 0;
			}
			put_page(vmf->page);
		}
		if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
			if (total_map_swapcount == 1) {
				/*
				 * The page is all ours: point its
				 * anon_vma at this vma so rmap walks
				 * can take the exclusive fast path.
				 */
				page_move_anon_rmap(vmf->page, vma);
			}
			unlock_page(vmf->page);
			wp_page_reuse(vmf);
			return VM_FAULT_WRITE;
		}
		unlock_page(vmf->page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return wp_page_copy(vmf);
}
2778
/*
 * Unmap [start_addr, end_addr) of a single vma on behalf of
 * unmap_mapping_range_tree(); thin adapter converting the
 * (start, end) pair into zap_page_range_single()'s (start, size).
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
2785
/*
 * For every vma in the mapping's interval tree that overlaps the
 * page range [details->first_index, details->last_index], clip the
 * zap range to the overlap and unmap it.  All index arithmetic is
 * in file pages (pgoff units), converted to virtual addresses at
 * the end.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {

		vba = vma->vm_pgoff;			/* vma begin, in pages */
		vea = vba + vma_pages(vma) - 1;		/* vma end, inclusive */
		zba = details->first_index;		/* zap begin */
		if (zba < vba)
			zba = vba;			/* clip to vma start */
		zea = details->last_index;		/* zap end, inclusive */
		if (zea > vea)
			zea = vea;			/* clip to vma end */

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
/**
 * unmap_mapping_pages - unmap pages of a file from all user mappings
 * @mapping: the address space containing the pages
 * @start: index of the first page to unmap
 * @nr: number of pages to unmap (overflow means "to the end of file")
 * @even_cows: whether to also zap private COW copies of the pages
 *
 * Walks the mapping's vma interval tree under i_mmap lock and zaps
 * every user pte covering [start, start + nr).  With !@even_cows,
 * details.check_mapping restricts the zap to ptes still pointing at
 * this mapping's own pages.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };

	details.check_mapping = even_cows ? NULL : mapping;
	details.first_index = start;
	details.last_index = start + nr - 1;
	if (details.last_index < details.first_index)
		details.last_index = ULONG_MAX;

	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	i_mmap_unlock_write(mapping);
}
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the
 * specified address_space corresponding to the specified byte range
 * in the underlying file.
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap (must be page-aligned)
 * @holelen: size of prospective hole in bytes; 0 means to the end
 *           of the file
 * @even_cows: 1 when truncating a file, unmap even private COW pages;
 *             0 when invalidating pagecache, don't throw away
 *             private data.
 *
 * Converts the byte range to page indices (rounding the length up,
 * and clamping when the 64-bit hole end would overflow a pgoff_t)
 * and delegates to unmap_mapping_pages().
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	pgoff_t hba = holebegin >> PAGE_SHIFT;
	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
2873EXPORT_SYMBOL(unmap_mapping_range);
2874
2875
2876
2877
2878
2879
2880
2881
2882
/*
 * Handle a fault on a pte that holds a swap entry.  Classifies the
 * entry (migration / device-private / hwpoison / genuine swap),
 * brings the page back — from the swap cache, a direct synchronous
 * read for SWP_SYNCHRONOUS_IO devices with a single swap count, or
 * swapin readahead — then, after re-validating the pte under its
 * lock, re-installs it as an anonymous pte, frees the swap slot and
 * optionally falls through into do_wp_page() for write faults.
 *
 * Called with mmap_sem held (possibly for reading, per the elided
 * header comment's convention in this file) and the pte mapped but
 * not locked; returns a VM_FAULT_* mask.
 */
int do_swap_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *swapcache;
	struct mem_cgroup *memcg;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	int ret = 0;

	/* Bail if the pte changed since the caller sampled orig_pte. */
	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (unlikely(non_swap_entry(entry))) {
		if (is_migration_entry(entry)) {
			/* Page is mid-migration: wait and retry. */
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		} else if (is_device_private_entry(entry)) {
			/*
			 * Device-private (HMM) memory: hand the fault
			 * to the owning device driver.
			 */
			ret = device_private_entry_fault(vma, vmf->address, entry,
						 vmf->flags, vmf->pmd);
		} else if (is_hwpoison_entry(entry)) {
			ret = VM_FAULT_HWPOISON;
		} else {
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		}
		goto out;
	}

	/* Genuine swap entry: find or read the page. */
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry, vma, vmf->address);
	swapcache = page;

	if (!page) {
		struct swap_info_struct *si = swp_swap_info(entry);

		if (si->flags & SWP_SYNCHRONOUS_IO &&
				__swap_count(si, entry) == 1) {
			/*
			 * Fast path: sole owner on a synchronous
			 * device — read straight into a fresh page,
			 * skipping the swap cache.
			 */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
							vmf->address);
			if (page) {
				__SetPageLocked(page);
				__SetPageSwapBacked(page);
				set_page_private(page, entry.val);
				lru_cache_add_anon(page);
				swap_readpage(page, true);
			}
		} else {
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						vmf);
			swapcache = page;
		}

		if (!page) {
			/*
			 * Allocation failed.  If the pte is still
			 * ours the fault genuinely OOMed; otherwise
			 * someone else handled it and we return 0.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
			goto unlock;
		}

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
	} else if (PageHWPoison(page)) {
		/*
		 * The page was poisoned while it sat in the swap
		 * cache; fail the fault with HWPOISON.
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
		goto out_release;
	}

	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
		goto out_release;
	}

	/*
	 * The swap-cache page may have been reused for a different
	 * entry while we slept on the page lock; only the direct
	 * (non-swapcache) read path is exempt from this check.
	 */
	if (unlikely((!PageSwapCache(page) ||
			page_private(page) != entry.val)) && swapcache)
		goto out_page;

	/* KSM pages may need a private copy before mapping. */
	page = ksm_might_need_to_copy(page, vma, vmf->address);
	if (unlikely(!page)) {
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	}

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}

	/*
	 * Back under the pte lock: bail if a parallel fault already
	 * replaced our swap pte.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/*
	 * Commit point: account the page as anonymous, build the new
	 * pte (writable immediately if this is a write fault and the
	 * page is exclusively ours), preserve soft-dirty, and hook
	 * the page into rmap/memcg/LRU.
	 */
	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
	pte = mk_pte(page, vma->vm_page_prot);
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		vmf->flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	}
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
	vmf->orig_pte = pte;

	/* ksm created a completely new copy */
	if (unlikely(page != swapcache && swapcache)) {
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	} else {
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
		mem_cgroup_commit_charge(page, memcg, true, false);
		activate_page(page);
	}

	swap_free(entry);
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
	if (page != swapcache && swapcache) {
		/*
		 * The KSM copy is mapped instead of the swap-cache
		 * page; drop our lock and reference on the latter.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	}

	if (vmf->flags & FAULT_FLAG_WRITE) {
		ret |= do_wp_page(vmf);
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	}

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	return ret;
out_nomap:
	mem_cgroup_cancel_charge(page, memcg, false);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache && swapcache) {
		unlock_page(swapcache);
		put_page(swapcache);
	}
	return ret;
}
3097
3098
3099
3100
3101
3102
/*
 * Handle a fault on an anonymous vma with no pte present: map the shared
 * zero page for a read fault, or allocate and map a zeroed page for a
 * write fault.  We enter with non-exclusive mmap_sem (to exclude vma
 * changes but allow concurrent faults); the pte is unmapped and unlocked
 * on return.
 */
static int do_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mem_cgroup *memcg;
	struct page *page;
	int ret = 0;
	pte_t entry;

	/* File mapping without ->vm_ops ends up here; refuse shared ones. */
	if (vma->vm_flags & VM_SHARED)
		return VM_FAULT_SIGBUS;

	/*
	 * Make sure a pte page exists before taking any locks, so the
	 * failure path stays simple.
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
		return VM_FAULT_OOM;

	/*
	 * If a huge pmd materialized under us, retry the fault rather than
	 * walking an unstable pmd (see pmd_trans_unstable()).
	 */
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
		return 0;

	/* Use the zero-page for reads when the arch/mm allows it. */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
						vma->vm_page_prot));
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (!pte_none(*vmf->pte))
			goto unlock;
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
		/* Deliver the page fault to userland; checked under PT lock. */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;
	}

	/* Allocate our own private, already-zeroed page. */
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
	if (!page)
		goto oom;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
		goto oom_free_page;

	/*
	 * __SetPageUptodate() contains the memory barrier that orders the
	 * page-contents stores before the pte is made visible below.
	 */
	__SetPageUptodate(page);

	entry = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (!pte_none(*vmf->pte))
		goto release;

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

	/* Deliver the page fault to userland; checked under PT lock. */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		mem_cgroup_cancel_charge(page, memcg, false);
		put_page(page);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, vmf->address, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
setpte:
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate: the pte was non-present before. */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
release:
	mem_cgroup_cancel_charge(page, memcg, false);
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}
3211
3212
3213
3214
3215
3216
/*
 * Invoke the vma's ->fault() handler and, unless the handler reported an
 * error/retry/nopage condition, hand back a locked, non-poisoned page in
 * vmf->page.
 */
static int __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	if (unlikely(PageHWPoison(vmf->page))) {
		/* Hardware-poisoned page: drop it and report the poison. */
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf->page);
		put_page(vmf->page);
		vmf->page = NULL;
		return VM_FAULT_HWPOISON;
	}

	/* Callers expect the page locked; lock it if ->fault() did not. */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf->page);
	else
		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

	return ret;
}
3242
3243
3244
3245
3246
3247
3248
/*
 * A pte-level walk is unsafe both under a devmap huge pmd and under a pmd
 * that pmd_trans_unstable() flags as possibly changing; report either.
 */
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}
3253
/*
 * Ensure a pte page is installed below vmf->pmd (preferring the
 * preallocated one in vmf->prealloc_pte) and map+lock the pte for
 * vmf->address into vmf->pte/vmf->ptl.
 *
 * Returns 0 on success, VM_FAULT_OOM if the pte page cannot be
 * allocated, or VM_FAULT_NOPAGE when the pmd is unstable and the fault
 * must simply be retried.
 */
static int pte_alloc_one_map(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (!pmd_none(*vmf->pmd))
		goto map_pte;
	if (vmf->prealloc_pte) {
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_none(*vmf->pmd))) {
			/* Someone else populated the pmd meanwhile. */
			spin_unlock(vmf->ptl);
			goto map_pte;
		}

		mm_inc_nr_ptes(vma->vm_mm);
		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
		spin_unlock(vmf->ptl);
		vmf->prealloc_pte = NULL;
	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
		return VM_FAULT_OOM;
	}
map_pte:
	/*
	 * If a huge pmd materialized under us (e.g. collapsed by
	 * khugepaged, or a devmap pmd was installed), we cannot descend to
	 * the pte level.  Ask the caller to retry the fault instead of
	 * dereferencing an unstable pmd.
	 */
	if (pmd_devmap_trans_unstable(vmf->pmd))
		return VM_FAULT_NOPAGE;

	/*
	 * Past this point the pmd points to a pte page, so mapping and
	 * locking the pte is safe.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	return 0;
}
3302
3303#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3304
3305#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
/*
 * A PMD-sized page can be mapped at @haddr only when the vma's file
 * offset has the same alignment (mod HPAGE_PMD_NR pages) as its virtual
 * start, and the whole huge-page range lies inside the vma.
 */
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
		unsigned long haddr)
{
	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
		return false;
	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return false;
	return true;
}
3316
/*
 * Deposit the preallocated pte page under the huge pmd so it can be
 * withdrawn later if the pmd is split.
 */
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * The prealloc table is consumed by the deposit; account it as an
	 * installed pte page.
	 */
	mm_inc_nr_ptes(vma->vm_mm);
	vmf->prealloc_pte = NULL;
}
3329
/*
 * Try to map @page with a huge pmd at the fault address.  Returns 0 on
 * success, VM_FAULT_FALLBACK when a pte-level mapping should be used
 * instead, or VM_FAULT_OOM when the deposit pte page cannot be
 * allocated.
 */
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t entry;
	int i, ret;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;

	ret = VM_FAULT_FALLBACK;
	page = compound_head(page);

	/*
	 * Archs that need a deposited page table (arch_needs_pgtable_deposit)
	 * must have one preallocated before taking the pmd lock.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		/* Order the pte-page initialization before it is observed. */
		smp_wmb();
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
		goto out;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_icache_page(vma, page + i);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
	page_add_file_rmap(page, true);

	/* Deposit and set_pmd_at happen under the same pmd lock. */
	if (arch_needs_pgtable_deposit())
		deposit_prealloc_pte(vmf);

	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

	update_mmu_cache_pmd(vma, haddr, vmf->pmd);

	/* Fault handled at pmd level. */
	ret = 0;
	count_vm_event(THP_FILE_MAPPED);
out:
	spin_unlock(vmf->ptl);
	return ret;
}
3385#else
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	/* Unreachable without CONFIG_TRANSPARENT_HUGE_PAGECACHE. */
	BUILD_BUG();
	return 0;
}
3391#endif
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
/**
 * alloc_set_pte - set up a new pte (or huge pmd) entry for the given page
 * and add the reverse mapping.
 * @vmf: fault environment; vmf->pte may be NULL on entry, in which case
 *       the pte page is allocated/mapped here
 * @memcg: memcg to charge the page to (anonymous/COW case only; must be
 *         NULL for huge file pages)
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Return: 0 on success, VM_FAULT_ code in case of error.
 */
int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
		struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pte_t entry;
	int ret;

	if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
			IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
		/* THP on COW? */
		VM_BUG_ON_PAGE(memcg, page);

		ret = do_set_pmd(vmf, page);
		if (ret != VM_FAULT_FALLBACK)
			return ret;
	}

	if (!vmf->pte) {
		ret = pte_alloc_one_map(vmf);
		if (ret)
			return ret;
	}

	/* Re-check under ptl: someone else may have installed a pte. */
	if (unlikely(!pte_none(*vmf->pte)))
		return VM_FAULT_NOPAGE;

	flush_icache_page(vma, page);
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	} else {
		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
		page_add_file_rmap(page, false);
	}
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

	/* No need to invalidate: the pte was non-present before. */
	update_mmu_cache(vma, vmf->address, vmf->pte);

	return 0;
}
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
/**
 * finish_fault - finish page fault once we have prepared the page to fault
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared.  It handles locking of the pte, installs
 * the new pte entry, and on anonymous/private mappings refuses to touch an
 * address space that is being torn down (check_stable_address_space()).
 *
 * The function expects the page to be locked and, for COW faults, the page
 * in vmf->cow_page prepared.  Return: 0 on success, VM_FAULT_ code in case
 * of error.
 */
int finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	int ret = 0;

	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check_stable_address_space() is needed only for private mappings:
	 * shared mappings do not instantiate anonymous memory.
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	if (!ret)
		ret = alloc_set_pte(vmf, vmf->memcg, page);
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}
3496
3497static unsigned long fault_around_bytes __read_mostly =
3498 rounddown_pow_of_two(65536);
3499
3500#ifdef CONFIG_DEBUG_FS
3501static int fault_around_bytes_get(void *data, u64 *val)
3502{
3503 *val = fault_around_bytes;
3504 return 0;
3505}
3506
3507
3508
3509
3510
3511static int fault_around_bytes_set(void *data, u64 val)
3512{
3513 if (val / PAGE_SIZE > PTRS_PER_PTE)
3514 return -EINVAL;
3515 if (val > PAGE_SIZE)
3516 fault_around_bytes = rounddown_pow_of_two(val);
3517 else
3518 fault_around_bytes = PAGE_SIZE;
3519 return 0;
3520}
3521DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3522 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3523
/* Expose fault_around_bytes as a writable debugfs file at boot. */
static int __init fault_around_debugfs(void)
{
	void *ret;

	ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
			&fault_around_bytes_fops);
	if (!ret)
		/* Non-fatal: fault-around still works with the default. */
		pr_warn("Failed to create fault_around_bytes in debugfs");
	return 0;
}
3534late_initcall(fault_around_debugfs);
3535#endif
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
/*
 * do_fault_around() tries to map few pages around the fault address.  The
 * hope is that the pages will be needed soon and this will lower the
 * number of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips pages that
 * are not available or locked.  The window is fault_around_bytes wide
 * (power of two, at least PAGE_SIZE), clipped to the vma and to a single
 * pte page.  vmf->pte must be NULL on entry and is left NULL on return;
 * vmf->address is restored before returning.
 *
 * Returns 0 (fall through to __do_fault()) or VM_FAULT_NOPAGE when the
 * faulting pte was populated by ->map_pages() or turned into a huge pmd.
 */
static int do_fault_around(struct vm_fault *vmf)
{
	unsigned long address = vmf->address, nr_pages, mask;
	pgoff_t start_pgoff = vmf->pgoff;
	pgoff_t end_pgoff;
	int off, ret = 0;

	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	vmf->address = max(address & mask, vmf->vma->vm_start);
	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/*
	 * end_pgoff is either the end of the pte page that contains
	 * vmf->address, the end of the vma, or start_pgoff + nr_pages - 1,
	 * whichever comes first.
	 */
	end_pgoff = start_pgoff -
		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
			start_pgoff + nr_pages - 1);

	if (pmd_none(*vmf->pmd)) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
						  vmf->address);
		if (!vmf->prealloc_pte)
			goto out;
		/* Order the pte-page initialization before it is observed. */
		smp_wmb();
	}

	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

	/* Huge page is mapped? No need to proceed. */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/* ->map_pages() haven't done anything useful? Don't bother. */
	if (!vmf->pte)
		goto out;

	/* check if the page fault is solved */
	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
	if (!pte_none(*vmf->pte))
		ret = VM_FAULT_NOPAGE;
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	vmf->address = address;
	vmf->pte = NULL;
	return ret;
}
3615
/* Handle a read fault on a file-backed vma. */
static int do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret = 0;

	/*
	 * Let's call ->map_pages() first and use ->fault() as fallback if
	 * page by the offset is not ready to be mapped (cold page cache or
	 * something).
	 */
	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		ret = do_fault_around(vmf);
		if (ret)
			return ret;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	/* finish_fault() failed: drop the reference __do_fault() took. */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}
3642
/*
 * Handle a write fault on a private file mapping: read the file page,
 * copy it into a freshly allocated anonymous page, and map the copy.
 */
static int do_cow_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (!vmf->cow_page)
		return VM_FAULT_OOM;

	if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
				&vmf->memcg, false)) {
		put_page(vmf->cow_page);
		return VM_FAULT_OOM;
	}

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	/* The fault handler already copied and mapped the page for us. */
	if (ret & VM_FAULT_DONE_COW)
		return ret;

	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
	__SetPageUptodate(vmf->cow_page);

	ret |= finish_fault(vmf);
	unlock_page(vmf->page);
	put_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		goto uncharge_out;
	return ret;
uncharge_out:
	mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
	put_page(vmf->cow_page);
	return ret;
}
3681
/*
 * Handle a write fault on a shared file mapping: fault the page in,
 * give the filesystem a chance to prepare it for writing via
 * ->page_mkwrite(), then map it and mark it dirty.
 */
static int do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret, tmp;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable.
	 */
	if (vma->vm_ops->page_mkwrite) {
		unlock_page(vmf->page);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		unlock_page(vmf->page);
		put_page(vmf->page);
		return ret;
	}

	/* Dirty the page and balance writeback; also unlocks the page. */
	fault_dirty_shared_page(vma, vmf->page);
	return ret;
}
3716
3717
3718
3719
3720
3721
3722
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).  The mmap_sem may have been released depending on
 * flags and our return value.  See filemap_fault() and
 * __lock_page_or_retry().
 */
static int do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
	if (!vma->vm_ops->fault)
		ret = VM_FAULT_SIGBUS;
	else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vma->vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}
3745
/*
 * Take a reference on @page, account the NUMA hinting fault, and ask the
 * memory policy which node (if any) the page should migrate to.
 * Returns the target node id from mpol_misplaced(), or -1 (no migration).
 */
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
{
	get_page(page);

	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(page, vma, addr);
}
3760
/*
 * Handle a fault on a PROT_NONE pte installed by NUMA balancing: restore
 * the pte's original protections and, if the page is misplaced, migrate
 * it to the faulting node.
 */
static int do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL;
	int page_nid = -1;
	int last_cpupid;
	int target_nid;
	bool migrated = false;
	pte_t pte;
	bool was_writable = pte_savedwrite(vmf->orig_pte);
	int flags = 0;

	/*
	 * The "pte" at this point cannot be used safely without
	 * validation through pte_unmap_same().  It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 */
	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto out;
	}

	/*
	 * Make it present again, restoring the original protections and
	 * the saved-write bit if the pte used to be writable.
	 */
	pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
	pte = pte_modify(pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/*
	 * Avoid grouping on RO pages in general.  RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state.  This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	/*
	 * Flag if the page is shared between multiple address spaces.  This
	 * is later used when determining whether to group tasks together.
	 */
	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
		flags |= TNF_SHARED;

	last_cpupid = page_cpupid_last(page);
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	if (target_nid == -1) {
		put_page(page);
		goto out;
	}

	/* Migrate to the requested node */
	migrated = migrate_misplaced_page(page, vma, target_nid);
	if (migrated) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else
		flags |= TNF_MIGRATE_FAIL;

out:
	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
}
3850
/* Create a huge pmd for a missing entry; anonymous or via ->huge_fault. */
static inline int create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}
3859
3860
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_wp_page(vmf, orig_pmd);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

	/* COW handled on pte level: split pmd and fall back to pte faults. */
	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);

	return VM_FAULT_FALLBACK;
}
3874
3875static inline bool vma_is_accessible(struct vm_area_struct *vma)
3876{
3877 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3878}
3879
/* Create a huge pud for a missing entry; file mappings only. */
static int create_huge_pud(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif
	return VM_FAULT_FALLBACK;
}
3891
/* Write fault on an existing huge pud; file mappings only. */
static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vmf->vma))
		return VM_FAULT_FALLBACK;
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif
	return VM_FAULT_FALLBACK;
}
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
/*
 * These routines also need to handle stuff like marking pages dirty and/or
 * accessed for architectures that don't do it in hardware (most RISC
 * architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures with
 * external mmu caches can use to update those (ie the Sparc or PowerPC
 * hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).  The mmap_sem may have been released depending on
 * flags and our return value.  See filemap_fault() and
 * __lock_page_or_retry().
 */
static int handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave vmf->pte NULL: do_anonymous_page()/do_fault() will
		 * allocate the pte page themselves.  A NULL vmf->pte with a
		 * pte_none() vmf->orig_pte below means "nothing mapped yet".
		 */
		vmf->pte = NULL;
	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap_trans_unstable(vmf->pmd))
			return 0;
		/*
		 * A regular pmd is established and can't morph into a huge
		 * pmd from under us anymore at this point because we hold
		 * the mmap_sem read mode and khugepaged takes it in write
		 * mode, so now it's safe to run pte_offset_map().
		 */
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		vmf->orig_pte = *vmf->pte;

		/*
		 * Some architectures can't read a 64-bit pte atomically;
		 * orig_pte is only a snapshot.  The barrier() keeps the
		 * compiler from re-reading *vmf->pte below and makes the
		 * pte_none() test operate on the snapshot.
		 */
		barrier();
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf);
		else
			return do_fault(vmf);
	}

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry)))
		goto unlock;
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}
4000
4001
4002
4003
4004
4005
4006
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 *
 * Walks/allocates the page-table hierarchy (p4d, pud, pmd), dispatching
 * huge-page faults at the pud and pmd levels before falling through to
 * handle_pte_fault().
 */
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	int ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		/* Keep the compiler from re-reading *vmf.pud below. */
		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/* NUMA case for anonymous PUDs would go here */

			if (dirty && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;
	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pmd_t orig_pmd = *vmf.pmd;

		/* Keep the compiler from re-reading *vmf.pmd below. */
		barrier();
		if (unlikely(is_swap_pmd(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					  !is_pmd_migration_entry(orig_pmd));
			/* Wait for THP migration to finish, then retry. */
			if (is_pmd_migration_entry(orig_pmd))
				pmd_migration_entry_wait(mm, vmf.pmd);
			return 0;
		}
		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
				return do_huge_pmd_numa_page(&vmf, orig_pmd);

			if (dirty && !pmd_write(orig_pmd)) {
				ret = wp_huge_pmd(&vmf, orig_pmd);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pmd_set_accessed(&vmf, orig_pmd);
				return 0;
			}
		}
	}

	return handle_pte_fault(&vmf);
}
4089
4090
4091
4092
4093
4094
4095
/*
 * By the time we get here, we already hold the mm semaphore.
 *
 * The mmap_sem may have been released depending on flags and our return
 * value.  See filemap_fault() and __lock_page_or_retry().
 */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	count_memcg_event_mm(vma->vm_mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}

	return ret;
}
4141
4142#ifndef __PAGETABLE_P4D_FOLDED
4143
4144
4145
4146
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/* Order the p4d-page initialization before it becomes visible. */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))	/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}
4163#endif
4164
4165#ifndef __PAGETABLE_PUD_FOLDED
4166
4167
4168
4169
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/* Order the pud-page initialization before it becomes visible. */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_5LEVEL_HACK
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		p4d_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
#else
	if (!pgd_present(*p4d)) {
		mm_inc_nr_puds(mm);
		pgd_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
#endif /* __ARCH_HAS_5LEVEL_HACK */
	spin_unlock(&mm->page_table_lock);
	return 0;
}
4195#endif
4196
4197#ifndef __PAGETABLE_PMD_FOLDED
4198
4199
4200
4201
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/* Order the pmd-page initialization before it becomes visible. */
	smp_wmb();

	ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
#else
	if (!pgd_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pgd_populate(mm, pud, new);
	} else /* Another has populated it */
		pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
	spin_unlock(ptl);
	return 0;
}
4228#endif
4229
/*
 * Walk the page tables for @address and return a mapped+locked pte in
 * *ptepp (or, for a huge pmd when @pmdpp is non-NULL, the pmd in *pmdpp
 * with the pmd lock held).  When @start/@end are supplied, an mmu-notifier
 * invalidate_range_start is issued for the looked-up range and left open
 * for the caller to close after it is done with the entry.
 *
 * Returns 0 on success with *ptlp holding the lock, -EINVAL when no
 * present entry exists.
 */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
			    unsigned long *start, unsigned long *end,
			    pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
		goto out;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		goto out;

	pmd = pmd_offset(pud, address);
	VM_BUG_ON(pmd_trans_huge(*pmd));

	if (pmd_huge(*pmd)) {
		if (!pmdpp)
			goto out;

		if (start && end) {
			*start = address & PMD_MASK;
			*end = *start + PMD_SIZE;
			mmu_notifier_invalidate_range_start(mm, *start, *end);
		}
		*ptlp = pmd_lock(mm, pmd);
		/* Re-check under the pmd lock: it may have been split. */
		if (pmd_huge(*pmd)) {
			*pmdpp = pmd;
			return 0;
		}
		spin_unlock(*ptlp);
		if (start && end)
			mmu_notifier_invalidate_range_end(mm, *start, *end);
	}

	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		goto out;

	if (start && end) {
		*start = address & PAGE_MASK;
		*end = *start + PAGE_SIZE;
		mmu_notifier_invalidate_range_start(mm, *start, *end);
	}
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
	if (!pte_present(*ptep))
		goto unlock;
	*ptepp = ptep;
	return 0;
unlock:
	pte_unmap_unlock(ptep, *ptlp);
	if (start && end)
		mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
	return -EINVAL;
}
4294
/* pte-only wrapper around __follow_pte_pmd(); no mmu-notifier range. */
static inline int follow_pte(struct mm_struct *mm, unsigned long address,
			     pte_t **ptepp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy; __cond_lock is for sparse */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte_pmd(mm, address, NULL, NULL,
						    ptepp, NULL, ptlp)));
	return res;
}
4306
/* Exported wrapper around __follow_pte_pmd(); see that function. */
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
		   unsigned long *start, unsigned long *end,
		   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
	int res;

	/* (void) is needed to make gcc happy; __cond_lock is for sparse */
	(void) __cond_lock(*ptlp,
			   !(res = __follow_pte_pmd(mm, address, start, end,
						    ptepp, pmdpp, ptlp)));
	return res;
}
4319EXPORT_SYMBOL(follow_pte_pmd);
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4332 unsigned long *pfn)
4333{
4334 int ret = -EINVAL;
4335 spinlock_t *ptl;
4336 pte_t *ptep;
4337
4338 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4339 return ret;
4340
4341 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4342 if (ret)
4343 return ret;
4344 *pfn = pte_pfn(*ptep);
4345 pte_unmap_unlock(ptep, ptl);
4346 return 0;
4347}
4348EXPORT_SYMBOL(follow_pfn);
4349
4350#ifdef CONFIG_HAVE_IOREMAP_PROT
/*
 * Look up the physical address and protection bits behind a user virtual
 * address in an IO/PFN mapping.  When FOLL_WRITE is requested, fail if
 * the pte is not writable.  Returns 0 on success, -EINVAL otherwise.
 */
int follow_phys(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags,
		unsigned long *prot, resource_size_t *phys)
{
	int ret = -EINVAL;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		goto out;
	pte = *ptep;

	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

	ret = 0;
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return ret;
}
4378
4379int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4380 void *buf, int len, int write)
4381{
4382 resource_size_t phys_addr;
4383 unsigned long prot = 0;
4384 void __iomem *maddr;
4385 int offset = addr & (PAGE_SIZE-1);
4386
4387 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4388 return -EINVAL;
4389
4390 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4391 if (write)
4392 memcpy_toio(maddr + offset, buf, len);
4393 else
4394 memcpy_fromio(buf, maddr + offset, len);
4395 iounmap(maddr);
4396
4397 return len;
4398}
4399EXPORT_SYMBOL_GPL(generic_access_phys);
4400#endif
4401
4402
4403
4404
4405
/*
 * Access another process' address space as given in @mm.  Pins each page
 * with get_user_pages_remote() and copies through a kmap; falls back to
 * the vma's ->access() hook (IO mappings) when no page can be pinned.
 * Returns the number of bytes successfully transferred.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;
	int write = gup_flags & FOLL_WRITE;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;

		ret = get_user_pages_remote(tsk, mm, addr, 1,
				gup_flags, &page, &vma, NULL);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
		} else {
			bytes = len;
			offset = addr & (PAGE_SIZE-1);
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;

			maddr = kmap(page);
			if (write) {
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page);
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);
			}
			kunmap(page);
			put_page(page);
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour (FOLL_WRITE for writes)
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}
4482
4483
4484
4485
4486
4487
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Returns the number of bytes copied, or zero when the task has no mm.
 * Do not walk the page table directly: access via get_user_pages.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(tsk, mm, addr, buf, len,
					    gup_flags);
		mmput(mm);
	}

	return copied;
}
4504EXPORT_SYMBOL_GPL(access_process_vm);
4505
4506
4507
4508
/*
 * Print the pathname, start and size of the vma containing @ip in the
 * current task's address space, prefixed by @prefix.  Prints nothing if no
 * mapped file covers @ip or the lock/allocation below fails.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * trylock: presumably this can be called from contexts where
	 * blocking on mmap_sem is not allowed (or could deadlock) -- bail
	 * out silently instead of sleeping.  TODO(review): confirm callers.
	 */
	if (!down_read_trylock(&mm->mmap_sem))
		return;

	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		/* GFP_NOWAIT: must not sleep while holding mmap_sem here. */
		char *buf = (char *)__get_free_page(GFP_NOWAIT);
		if (buf) {
			char *p;

			p = file_path(f, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&mm->mmap_sem);
}
4538
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
/*
 * Annotate a code path that may fault on user memory: warn (via
 * __might_sleep) if we are in a context that must not sleep, and tell
 * lockdep that mmap_sem may be taken for reading by the fault handler.
 */
void __might_fault(const char *file, int line)
{
	/*
	 * A user access with KERNEL_DS, or with page faults disabled, will
	 * not end up sleeping in the fault path, so skip the checks in
	 * those cases.
	 */
	if (uaccess_kernel())
		return;
	if (pagefault_disabled())
		return;
	__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
	if (current->mm)
		/* Fixed: "&current" had been corrupted into the "¤t" entity residue. */
		might_lock_read(&current->mm->mmap_sem);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
4560
4561#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4562static void clear_gigantic_page(struct page *page,
4563 unsigned long addr,
4564 unsigned int pages_per_huge_page)
4565{
4566 int i;
4567 struct page *p = page;
4568
4569 might_sleep();
4570 for (i = 0; i < pages_per_huge_page;
4571 i++, p = mem_map_next(p, page, i)) {
4572 cond_resched();
4573 clear_user_highpage(p, addr + i * PAGE_SIZE);
4574 }
4575}
/*
 * Clear a huge page.  @addr_hint is the user address that triggered the
 * fault; the subpages nearest it are cleared last (see the converging loop
 * at the bottom), presumably to keep their cache lines hot for the
 * faulting access -- TODO(review): confirm intent against commit history.
 */
void clear_huge_page(struct page *page,
		     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
	int i, n, base, l;
	/* Round the hint down to the start of the huge page. */
	unsigned long addr = addr_hint &
		~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
		clear_gigantic_page(page, addr, pages_per_huge_page);
		return;
	}

	/* n: index of the target subpage within the huge page. */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= pages_per_huge_page) {
		/* Target subpage is in the first half of the huge page. */
		base = 0;
		l = n;
		/* Clear subpages at the end of the huge page first. */
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
			cond_resched();
			clear_user_highpage(page + i, addr + i * PAGE_SIZE);
		}
	} else {
		/* Target subpage is in the second half of the huge page. */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		/* Clear subpages at the beginning of the huge page first. */
		for (i = 0; i < base; i++) {
			cond_resched();
			clear_user_highpage(page + i, addr + i * PAGE_SIZE);
		}
	}

	/*
	 * Clear the remaining 2*l subpages in a left-right-left-right
	 * pattern converging towards the target subpage.
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		clear_user_highpage(page + left_idx,
				    addr + left_idx * PAGE_SIZE);
		cond_resched();
		clear_user_highpage(page + right_idx,
				    addr + right_idx * PAGE_SIZE);
	}
}
4626
4627static void copy_user_gigantic_page(struct page *dst, struct page *src,
4628 unsigned long addr,
4629 struct vm_area_struct *vma,
4630 unsigned int pages_per_huge_page)
4631{
4632 int i;
4633 struct page *dst_base = dst;
4634 struct page *src_base = src;
4635
4636 for (i = 0; i < pages_per_huge_page; ) {
4637 cond_resched();
4638 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4639
4640 i++;
4641 dst = mem_map_next(dst, dst_base, i);
4642 src = mem_map_next(src, src_base, i);
4643 }
4644}
4645
4646void copy_user_huge_page(struct page *dst, struct page *src,
4647 unsigned long addr, struct vm_area_struct *vma,
4648 unsigned int pages_per_huge_page)
4649{
4650 int i;
4651
4652 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4653 copy_user_gigantic_page(dst, src, addr, vma,
4654 pages_per_huge_page);
4655 return;
4656 }
4657
4658 might_sleep();
4659 for (i = 0; i < pages_per_huge_page; i++) {
4660 cond_resched();
4661 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4662 }
4663}
4664
/*
 * Copy pages_per_huge_page * PAGE_SIZE bytes from user memory @usr_src
 * into the subpages of @dst_page.
 *
 * Returns the number of bytes that could NOT be copied (0 on full
 * success), mirroring the copy_from_user() convention.
 *
 * @allow_pagefault selects kmap() (may sleep; faults on @usr_src can be
 * serviced) over kmap_atomic() for mapping each destination subpage.
 */
long copy_huge_page_from_user(struct page *dst_page,
				const void __user *usr_src,
				unsigned int pages_per_huge_page,
				bool allow_pagefault)
{
	void *src = (void *)usr_src;
	void *page_kaddr;
	unsigned long i, rc = 0;
	/* Start from the full size and subtract what each page copies. */
	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;

	for (i = 0; i < pages_per_huge_page; i++) {
		if (allow_pagefault)
			page_kaddr = kmap(dst_page + i);
		else
			page_kaddr = kmap_atomic(dst_page + i);
		/* rc = bytes of this subpage that were NOT copied. */
		rc = copy_from_user(page_kaddr,
				(const void __user *)(src + i * PAGE_SIZE),
				PAGE_SIZE);
		/* Unmap with the counterpart of whichever map was used. */
		if (allow_pagefault)
			kunmap(dst_page + i);
		else
			kunmap_atomic(page_kaddr);

		ret_val -= (PAGE_SIZE - rc);
		if (rc)
			break;	/* partial copy: stop and report the remainder */

		cond_resched();
	}
	return ret_val;
}
4696#endif
4697
4698#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4699
4700static struct kmem_cache *page_ptl_cachep;
4701
4702void __init ptlock_cache_init(void)
4703{
4704 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4705 SLAB_PANIC, NULL);
4706}
4707
4708bool ptlock_alloc(struct page *page)
4709{
4710 spinlock_t *ptl;
4711
4712 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4713 if (!ptl)
4714 return false;
4715 page->ptl = ptl;
4716 return true;
4717}
4718
4719void ptlock_free(struct page *page)
4720{
4721 kmem_cache_free(page_ptl_cachep, page->ptl);
4722}
4723#endif
4724